From bc7951c21990130ac9a21a9bffe04033d2f24feb Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 4 Feb 2021 14:24:55 -0800 Subject: [PATCH 1/2] switch data_select in dataset signature to split --- test/data/test_builtin_datasets.py | 26 +++--- test/experimental/test_with_asset.py | 4 +- .../datasets/language_modeling.py | 52 +++++------ .../experimental/datasets/question_answer.py | 26 +++--- .../datasets/raw/language_modeling.py | 54 ++++++------ .../datasets/raw/question_answer.py | 20 ++--- .../datasets/raw/sequence_tagging.py | 18 ++-- .../datasets/raw/text_classification.py | 62 ++++++------- .../experimental/datasets/raw/translation.py | 24 +++--- .../experimental/datasets/sequence_tagging.py | 26 +++--- .../datasets/text_classification.py | 86 +++++++++---------- .../experimental/datasets/translation.py | 34 ++++---- 12 files changed, 216 insertions(+), 216 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index fcef4f1507..435f083049 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -57,12 +57,12 @@ def test_wikitext2(self): self.assertEqual(tokens_ids, [2, 286, 503, 700]) # Add test for the subset of the standard datasets - train_iter, valid_iter, test_iter = torchtext.experimental.datasets.raw.WikiText2(data_select=('train', 'valid', 'test')) + train_iter, valid_iter, test_iter = torchtext.experimental.datasets.raw.WikiText2(split=('train', 'valid', 'test')) self._helper_test_func(len(train_iter), 36718, next(iter(train_iter)), ' \n') self._helper_test_func(len(valid_iter), 3760, next(iter(valid_iter)), ' \n') self._helper_test_func(len(test_iter), 4358, next(iter(test_iter)), ' \n') del train_iter, valid_iter, test_iter - train_dataset, test_dataset = WikiText2(data_select=('train', 'test')) + train_dataset, test_dataset = WikiText2(split=('train', 'test')) train_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset))) test_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset))) self._helper_test_func(len(train_data), 2049990, train_data[20:25], @@ -105,14 +105,14 @@ def test_penntreebank(self): self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) # Add test for the subset of the standard datasets - train_dataset, test_dataset = PennTreebank(data_select=('train', 'test')) + train_dataset, test_dataset = PennTreebank(split=('train', 'test')) train_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, train_dataset))) test_data = torch.cat(tuple(filter(lambda t: t.numel() > 0, test_dataset))) self._helper_test_func(len(train_data), 924412, train_data[20:25], [9919, 9920, 9921, 9922, 9188]) self._helper_test_func(len(test_data), 82114, test_data[30:35], [397, 93, 4, 16, 7]) - train_iter, test_iter = torchtext.experimental.datasets.raw.PennTreebank(data_select=('train', 'test')) + train_iter, test_iter = torchtext.experimental.datasets.raw.PennTreebank(split=('train', 'test')) self._helper_test_func(len(train_iter), 42068, next(iter(train_iter))[:15], ' aer banknote b') self._helper_test_func(len(test_iter), 3761, next(iter(test_iter))[:25], " no it was n't black mond") del train_iter, test_iter @@ -130,7 +130,7 @@ def test_text_classification(self): [2351, 758, 96, 38581, 2351, 220, 5, 396, 3, 14786]) # Add test for the subset of the standard datasets - train_dataset, = AG_NEWS(data_select=('train')) + train_dataset, = AG_NEWS(split=('train')) self._helper_test_func(len(train_dataset), 120000, train_dataset[-1][1][:10], [2155, 223, 2405, 30, 3010, 2204, 54, 
3603, 4930, 2405]) train_iter, test_iter = torchtext.experimental.datasets.raw.AG_NEWS() @@ -154,7 +154,7 @@ def test_imdb(self): new_train_data, new_test_data = IMDB(vocab=new_vocab) # Add test for the subset of the standard datasets - train_dataset, = IMDB(data_select=('train')) + train_dataset, = IMDB(split=('train')) self._helper_test_func(len(train_dataset), 25000, train_dataset[0][1][:10], [13, 1568, 13, 246, 35468, 43, 64, 398, 1135, 92]) train_iter, test_iter = torchtext.experimental.datasets.raw.IMDB() @@ -225,7 +225,7 @@ def test_multi30k(self): [18, 24, 1168, 807, 16, 56, 83, 335, 1338]) # Add test for the subset of the standard datasets - train_iter, valid_iter = torchtext.experimental.datasets.raw.Multi30k(data_select=('train', 'valid')) + train_iter, valid_iter = torchtext.experimental.datasets.raw.Multi30k(split=('train', 'valid')) self._helper_test_func(len(train_iter), 29000, ' '.join(next(iter(train_iter))), ' '.join(['Zwei junge weiße Männer sind im Freien in der Nähe vieler Büsche.\n', 'Two young, White males are outside near many bushes.\n'])) @@ -233,7 +233,7 @@ def test_multi30k(self): ' '.join(['Eine Gruppe von Männern lädt Baumwolle auf einen Lastwagen\n', 'A group of men are loading cotton onto a truck\n'])) del train_iter, valid_iter - train_dataset, = Multi30k(data_select=('train')) + train_dataset, = Multi30k(split=('train')) self._helper_test_func(len(train_dataset), 29000, train_dataset[20], ([4, 444, 2531, 47, 17480, 7423, 8, 158, 10, 12, 5849, 3, 2], [5, 61, 530, 137, 1494, 10, 9, 280, 6, 2, 3749, 4, 3])) @@ -293,11 +293,11 @@ def test_udpos_sequence_tagging(self): self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) # Add test for the subset of the standard datasets - train_dataset, = UDPOS(data_select=('train')) + train_dataset, = UDPOS(split=('train')) self._helper_test_func(len(train_dataset), 12543, (train_dataset[0][0][:10], train_dataset[-1][2][:10]), ([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585], [6, 20, 8, 10, 8, 8, 24, 13, 8, 15])) - train_iter, valid_iter = torchtext.experimental.datasets.raw.UDPOS(data_select=('train', 'valid')) + train_iter, valid_iter = torchtext.experimental.datasets.raw.UDPOS(split=('train', 'valid')) self._helper_test_func(len(train_iter), 12543, ' '.join(next(iter(train_iter))[0][:5]), ' '.join(['Al', '-', 'Zaman', ':', 'American'])) self._helper_test_func(len(valid_iter), 2002, ' '.join(next(iter(valid_iter))[0][:5]), @@ -340,7 +340,7 @@ def test_conll_sequence_tagging(self): self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690]) # Add test for the subset of the standard datasets - train_dataset, = CoNLL2000Chunking(data_select=('train')) + train_dataset, = CoNLL2000Chunking(split=('train')) self._helper_test_func(len(train_dataset), 8936, (train_dataset[0][0][:10], train_dataset[0][1][:10], train_dataset[0][2][:10], train_dataset[-1][0][:10], train_dataset[-1][1][:10], train_dataset[-1][2][:10]), @@ -375,7 +375,7 @@ def test_squad1(self): new_train_data, new_test_data = SQuAD1(vocab=new_vocab) # Add test for the subset of the standard datasets - train_dataset, = SQuAD1(data_select=('train')) + train_dataset, = SQuAD1(split=('train')) context, question, answers, ans_pos = train_dataset[100] self._helper_test_func(len(train_dataset), 87599, (question[:5], ans_pos[0]), ([7, 24, 86, 52, 2], [72, 72])) @@ -404,7 +404,7 @@ def test_squad2(self): new_train_data, new_test_data = SQuAD2(vocab=new_vocab) # Add test for the subset of the standard datasets - train_dataset, = SQuAD2(data_select=('train')) + 
train_dataset, = SQuAD2(split=('train')) context, question, answers, ans_pos = train_dataset[200] self._helper_test_func(len(train_dataset), 130319, (question[:5], ans_pos[0]), ([84, 50, 1421, 12, 5439], [9, 9])) diff --git a/test/experimental/test_with_asset.py b/test/experimental/test_with_asset.py index 146eec53ce..50c01935a9 100644 --- a/test/experimental/test_with_asset.py +++ b/test/experimental/test_with_asset.py @@ -62,10 +62,10 @@ def test_wikitext103(self): self.assertEqual(tokens_ids, [2, 320, 437, 687]) # Add test for the subset of the standard datasets - train_dataset, test_dataset = torchtext.experimental.datasets.raw.WikiText103(data_select=('train', 'test')) + train_dataset, test_dataset = torchtext.experimental.datasets.raw.WikiText103(split=('train', 'test')) self._helper_test_func(len(train_dataset), 1801350, next(iter(train_dataset)), ' \n') self._helper_test_func(len(test_dataset), 4358, next(iter(test_dataset)), ' \n') - train_dataset, test_dataset = WikiText103(vocab=builtin_vocab, data_select=('train', 'test')) + train_dataset, test_dataset = WikiText103(vocab=builtin_vocab, split=('train', 'test')) self._helper_test_func(len(train_dataset), 1801350, train_dataset[10][:5], [2, 69, 12, 14, 265]) self._helper_test_func(len(test_dataset), 4358, test_dataset[28][:5], diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 3350c3f3b0..6e50b27df1 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -58,19 +58,19 @@ def get_vocab(self): return self.vocab -def _setup_datasets(dataset_name, tokenizer, root, vocab, data_select, year, language): +def _setup_datasets(dataset_name, tokenizer, root, vocab, split, year, language): if tokenizer is None: tokenizer = get_tokenizer('basic_english') - data_select = check_default_set(data_select, ('train', 'test', 'valid')) + split = check_default_set(split, ('train', 'test', 'valid')) if vocab is None: - if 'train' not in data_select: + if 'train' not in split: raise TypeError("Must pass a vocab if train is not selected.") if dataset_name == 'WMTNewsCrawl': - raw_train, = raw.DATASETS[dataset_name](root=root, data_select=('train',), year=year, language=language) + raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',), year=year, language=language) else: - raw_train, = raw.DATASETS[dataset_name](root=root, data_select=('train',)) + raw_train, = raw.DATASETS[dataset_name](root=root, split=('train',)) logger_.info('Building Vocab based on train data') vocab = build_vocab(raw_train, tokenizer) logger_.info('Vocab has %d entries', len(vocab)) @@ -79,16 +79,16 @@ def text_transform(line): return torch.tensor([vocab[token] for token in tokenizer(line)], dtype=torch.long) if dataset_name == 'WMTNewsCrawl': - raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select, year=year, language=language) + raw_datasets = raw.DATASETS[dataset_name](root=root, split=split, year=year, language=language) else: - raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select) - raw_data = {name: list(map(text_transform, raw_dataset)) for name, raw_dataset in zip(data_select, raw_datasets)} - logger_.info('Building datasets for {}'.format(data_select)) + raw_datasets = raw.DATASETS[dataset_name](root=root, split=split) + raw_data = {name: list(map(text_transform, raw_dataset)) for name, raw_dataset in zip(split, raw_datasets)} + logger_.info('Building datasets for 
{}'.format(split)) return tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform) - for item in data_select) + for item in split) -def WikiText2(tokenizer=None, root='.data', vocab=None, data_select=('train', 'valid', 'test')): +def WikiText2(tokenizer=None, root='.data', vocab=None, split=('train', 'valid', 'test')): """ Defines WikiText2 datasets. Create language modeling dataset: WikiText2 @@ -102,7 +102,7 @@ def WikiText2(tokenizer=None, root='.data', vocab=None, data_select=('train', 'v root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - data_select: a string or tupel for the returned datasets. Default: ('train', 'valid','test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid','test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -116,13 +116,13 @@ def WikiText2(tokenizer=None, root='.data', vocab=None, data_select=('train', 'v >>> train_dataset, valid_dataset, test_dataset = WikiText2(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab, - data_select='valid') + split='valid') """ - return _setup_datasets("WikiText2", tokenizer, root, vocab, data_select, None, None) + return _setup_datasets("WikiText2", tokenizer, root, vocab, split, None, None) -def WikiText103(tokenizer=None, root='.data', vocab=None, data_select=('train', 'valid', 'test')): +def WikiText103(tokenizer=None, root='.data', vocab=None, split=('train', 'valid', 'test')): """ Defines WikiText103 datasets. Create language modeling dataset: WikiText103 @@ -136,7 +136,7 @@ def WikiText103(tokenizer=None, root='.data', vocab=None, data_select=('train', root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - data_select: a string or tupel for the returned datasets. Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid', 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -150,14 +150,14 @@ def WikiText103(tokenizer=None, root='.data', vocab=None, data_select=('train', >>> train_dataset, valid_dataset, test_dataset = WikiText103(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab, - data_select='valid') + split='valid') """ - return _setup_datasets("WikiText103", tokenizer, root, vocab, data_select, None, None) + return _setup_datasets("WikiText103", tokenizer, root, vocab, split, None, None) -def PennTreebank(tokenizer=None, root='.data', vocab=None, data_select=('train', 'valid', 'test')): +def PennTreebank(tokenizer=None, root='.data', vocab=None, split=('train', 'valid', 'test')): """ Defines PennTreebank datasets. Create language modeling dataset: PennTreebank @@ -171,7 +171,7 @@ def PennTreebank(tokenizer=None, root='.data', vocab=None, data_select=('train', root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. 
If None, it will generate a new vocabulary based on the train data set. - data_select: a string or tupel for the returned datasets. Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid', 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -185,14 +185,14 @@ def PennTreebank(tokenizer=None, root='.data', vocab=None, data_select=('train', >>> train_dataset, valid_dataset, test_dataset = PennTreebank(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = PennTreebank(tokenizer=tokenizer, vocab=vocab, - data_select='valid') + split='valid') """ - return _setup_datasets("PennTreebank", tokenizer, root, vocab, data_select, None, None) + return _setup_datasets("PennTreebank", tokenizer, root, vocab, split, None, None) -def WMTNewsCrawl(tokenizer=None, root='.data', vocab=None, data_select=('train'), year=2010, language='en'): +def WMTNewsCrawl(tokenizer=None, root='.data', vocab=None, split=('train'), year=2010, language='en'): """ Defines WMTNewsCrawl datasets. Create language modeling dataset: WMTNewsCrawl @@ -206,7 +206,7 @@ def WMTNewsCrawl(tokenizer=None, root='.data', vocab=None, data_select=('train') root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train',)) year: the year of the dataset (Default: 2010) language: the language of the dataset (Default: 'en') Examples: >>> from torchtext.experimental.datasets import WMTNewsCrawl >>> from torchtext.data.utils import get_tokenizer >>> tokenizer = get_tokenizer("spacy") - >>> train_dataset, = WMTNewsCrawl(tokenizer=tokenizer, data_select='train') + >>> train_dataset, = WMTNewsCrawl(tokenizer=tokenizer, split='train') Note: WMTNewsCrawl provides datasets based on the year and language instead of train/valid/test. 
""" - return _setup_datasets("WMTNewsCrawl", tokenizer, root, vocab, data_select, year, language) + return _setup_datasets("WMTNewsCrawl", tokenizer, root, vocab, split, year, language) DATASETS = { diff --git a/torchtext/experimental/datasets/question_answer.py b/torchtext/experimental/datasets/question_answer.py index ec239deb97..f24551c5a4 100644 --- a/torchtext/experimental/datasets/question_answer.py +++ b/torchtext/experimental/datasets/question_answer.py @@ -61,16 +61,16 @@ def get_vocab(self): return self.vocab -def _setup_datasets(dataset_name, root, vocab, tokenizer, data_select): +def _setup_datasets(dataset_name, root, vocab, tokenizer, split): text_transform = [] if tokenizer is None: tokenizer = get_tokenizer('basic_english') text_transform = sequential_transforms(tokenizer) - data_select = check_default_set(data_select, ('train', 'dev')) - raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select) - raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(data_select, raw_datasets)} + split = check_default_set(split, ('train', 'dev')) + raw_datasets = raw.DATASETS[dataset_name](root=root, split=split) + raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(split, raw_datasets)} if vocab is None: - if 'train' not in data_select: + if 'train' not in split: raise TypeError("Must pass a vocab if train is not selected.") def apply_transform(data): @@ -85,11 +85,11 @@ def apply_transform(data): text_transform = sequential_transforms(text_transform, vocab_func(vocab), totensor(dtype=torch.long)) transforms = {'context': text_transform, 'question': text_transform, 'answers': text_transform, 'ans_pos': totensor(dtype=torch.long)} - logger_.info('Building datasets for {}'.format(data_select)) - return tuple(QuestionAnswerDataset(raw_data[item], vocab, transforms) for item in data_select) + logger_.info('Building datasets for {}'.format(split)) + return tuple(QuestionAnswerDataset(raw_data[item], vocab, transforms) for item in split) -def SQuAD1(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev')): +def SQuAD1(root='.data', vocab=None, tokenizer=None, split=('train', 'dev')): """ Defines SQuAD1 datasets. Create question answer dataset: SQuAD1 @@ -104,7 +104,7 @@ def SQuAD1(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev' The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'dev')) By default, all the two datasets (train, dev) are generated. Users could also choose any one of them, for example ('train', 'test') or @@ -120,10 +120,10 @@ def SQuAD1(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev' >>> train, dev = SQuAD1(tokenizer=tokenizer) """ - return _setup_datasets('SQuAD1', root, vocab, tokenizer, data_select) + return _setup_datasets('SQuAD1', root, vocab, tokenizer, split) -def SQuAD2(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev')): +def SQuAD2(root='.data', vocab=None, tokenizer=None, split=('train', 'dev')): """ Defines SQuAD2 datasets. Create question answer dataset: SQuAD2 @@ -138,7 +138,7 @@ def SQuAD2(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev' The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. 
A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'dev')) By default, all the two datasets (train, dev) are generated. Users could also choose any one of them, for example ('train', 'test') or @@ -154,7 +154,7 @@ def SQuAD2(root='.data', vocab=None, tokenizer=None, data_select=('train', 'dev' >>> train, dev = SQuAD2(tokenizer=tokenizer) """ - return _setup_datasets('SQuAD2', root, vocab, tokenizer, data_select) + return _setup_datasets('SQuAD2', root, vocab, tokenizer, split) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index b14c127316..b369a060b2 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -17,24 +17,24 @@ } -def _setup_datasets(dataset_name, root, data_select, year, language): - data_select = check_default_set(data_select, ('train', 'test', 'valid')) - if isinstance(data_select, str): - data_select = [data_select] - if not set(data_select).issubset(set(('train', 'test', 'valid'))): - raise TypeError('data_select is not supported!') +def _setup_datasets(dataset_name, root, split, year, language): + split = check_default_set(split, ('train', 'test', 'valid')) + if isinstance(split, str): + split = [split] + if not set(split).issubset(set(('train', 'test', 'valid'))): + raise TypeError('split is not supported!') if dataset_name == 'PennTreebank': extracted_files = [] select_to_index = {'train': 0, 'test': 1, 'valid': 2} extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], root=root, hash_value=MD5['PennTreebank'][key], - hash_type='md5') for key in data_select] + hash_type='md5') for key in split] elif dataset_name == 'WMTNewsCrawl': - if not (data_select == ['train'] or set(data_select).issubset(set(('train',)))): + if not (split == ['train'] or set(split).issubset(set(('train',)))): raise ValueError("WMTNewsCrawl only creates a training dataset. " - "data_select should be 'train' " - "or ('train',), got {}.".format(data_select)) + "split should be 'train' " + "or ('train',), got {}.".format(split)) dataset_tar = download_from_url(URLS[dataset_name], root=root, hash_value=MD5['WMTNewsCrawl'], hash_type='md5') extracted_files = extract_archive(dataset_tar) file_name = 'news.{}.{}.shuffled'.format(year, language) @@ -44,7 +44,7 @@ def _setup_datasets(dataset_name, root, data_select, year, language): extracted_files = extract_archive(dataset_tar) _path = {} - for item in data_select: + for item in split: for fname in extracted_files: if item in fname: _path[item] = fname @@ -55,10 +55,10 @@ def _setup_datasets(dataset_name, root, data_select, year, language): data[item] = iter(io.open(_path[item], encoding="utf8")) return tuple(RawTextIterableDataset(dataset_name, - NUM_LINES[dataset_name][item], data[item]) for item in data_select) + NUM_LINES[dataset_name][item], data[item]) for item in split) -def WikiText2(root='.data', data_select=('train', 'valid', 'test')): +def WikiText2(root='.data', split=('train', 'valid', 'test')): """ Defines WikiText2 datasets. Create language modeling dataset: WikiText2 @@ -66,7 +66,7 @@ def WikiText2(root='.data', data_select=('train', 'valid', 'test')): Args: root: Directory where the datasets are saved. 
Default: ".data" - data_select: a string or tupel for the returned datasets. Default: ('train', 'valid, 'test') + split: a string or tupel for the returned datasets. Default: ('train', 'valid, 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -76,14 +76,14 @@ def WikiText2(root='.data', data_select=('train', 'valid', 'test')): Examples: >>> from torchtext.experimental.raw.datasets import WikiText2 >>> train_dataset, valid_dataset, test_dataset = WikiText2() - >>> valid_dataset, = WikiText2(data_select='valid') + >>> valid_dataset, = WikiText2(split='valid') """ - return _setup_datasets("WikiText2", root, data_select, None, None) + return _setup_datasets("WikiText2", root, split, None, None) -def WikiText103(root='.data', data_select=('train', 'valid', 'test')): +def WikiText103(root='.data', split=('train', 'valid', 'test')): """ Defines WikiText103 datasets. Create language modeling dataset: WikiText103 @@ -91,7 +91,7 @@ def WikiText103(root='.data', data_select=('train', 'valid', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: the returned datasets. Default: ('train', 'valid','test') + split: the returned datasets. Default: ('train', 'valid','test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will @@ -100,13 +100,13 @@ def WikiText103(root='.data', data_select=('train', 'valid', 'test')): Examples: >>> from torchtext.experimental.datasets.raw import WikiText103 >>> train_dataset, valid_dataset, test_dataset = WikiText103() - >>> valid_dataset, = WikiText103(data_select='valid') + >>> valid_dataset, = WikiText103(split='valid') """ - return _setup_datasets("WikiText103", root, data_select, None, None) + return _setup_datasets("WikiText103", root, split, None, None) -def PennTreebank(root='.data', data_select=('train', 'valid', 'test')): +def PennTreebank(root='.data', split=('train', 'valid', 'test')): """ Defines PennTreebank datasets. Create language modeling dataset: PennTreebank @@ -114,7 +114,7 @@ def PennTreebank(root='.data', data_select=('train', 'valid', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets ('train', 'valid', 'test') are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -125,21 +125,21 @@ def PennTreebank(root='.data', data_select=('train', 'valid', 'test')): Examples: >>> from torchtext.experimental.datasets.raw import PennTreebank >>> train_dataset, valid_dataset, test_dataset = PennTreebank() - >>> valid_dataset, = PennTreebank(data_select='valid') + >>> valid_dataset, = PennTreebank(split='valid') """ - return _setup_datasets("PennTreebank", root, data_select, None, None) + return _setup_datasets("PennTreebank", root, split, None, None) -def WMTNewsCrawl(root='.data', data_select=('train'), year=2010, language='en'): +def WMTNewsCrawl(root='.data', split=('train'), year=2010, language='en'): """ Defines WMT News Crawl. 
Create language modeling dataset: WMTNewsCrawl Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. + split: a string or tuple for the returned datasets. (Default: 'train') year: the year of the dataset (Default: 2010) language: the language of the dataset (Default: 'en') @@ -147,7 +147,7 @@ def WMTNewsCrawl(root='.data', data_select=('train'), year=2010, language='en'): Note: WMTNewsCrawl provides datasets based on the year and language instead of train/valid/test. """ - return _setup_datasets("WMTNewsCrawl", root, data_select, year, language) + return _setup_datasets("WMTNewsCrawl", root, split, year, language) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/question_answer.py b/torchtext/experimental/datasets/raw/question_answer.py index d21dbdbc55..3bc4d83461 100644 --- a/torchtext/experimental/datasets/raw/question_answer.py +++ b/torchtext/experimental/datasets/raw/question_answer.py @@ -29,15 +29,15 @@ def _create_data_from_json(data_path): yield (_context, _question, _answers, _answer_start) -def _setup_datasets(dataset_name, root, data_select): - data_select = check_default_set(data_select, ('train', 'dev')) +def _setup_datasets(dataset_name, root, split): + split = check_default_set(split, ('train', 'dev')) extracted_files = {key: download_from_url(URLS[dataset_name][key], root=root, - hash_value=MD5[dataset_name][key], hash_type='md5') for key in data_select} + hash_value=MD5[dataset_name][key], hash_type='md5') for key in split} return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item], - _create_data_from_json(extracted_files[item])) for item in data_select) + _create_data_from_json(extracted_files[item])) for item in split) -def SQuAD1(root='.data', data_select=('train', 'dev')): +def SQuAD1(root='.data', split=('train', 'dev')): """ A dataset iterator yields the data of Stanford Question Answering dataset - SQuAD1.0. The iterator yields a tuple of (raw context, raw question, a list of raw answer, a list of answer positions in the raw context). @@ -48,7 +48,7 @@ def SQuAD1(root='.data', data_select=('train', 'dev')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets (Default: ('train', 'dev')) + split: a string or tuple for the returned datasets (Default: ('train', 'dev')) By default, both datasets (train, dev) are generated. Users could also choose any one or two of them, for example ('train', 'dev') or just a string 'train'. @@ -58,10 +58,10 @@ def SQuAD1(root='.data', data_select=('train', 'dev')): >>> print(idx, (context, question, answer, ans_pos)) """ - return _setup_datasets("SQuAD1", root, data_select) + return _setup_datasets("SQuAD1", root, split) -def SQuAD2(root='.data', data_select=('train', 'dev')): +def SQuAD2(root='.data', split=('train', 'dev')): """ A dataset iterator yields the data of Stanford Question Answering dataset - SQuAD2.0. The iterator yields a tuple of (raw context, raw question, a list of raw answer, a list of answer positions in the raw context). @@ -72,7 +72,7 @@ def SQuAD2(root='.data', data_select=('train', 'dev')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets (Default: ('train', 'dev')) + split: a string or tuple for the returned datasets (Default: ('train', 'dev')) By default, both datasets (train, dev) are generated. 
Users could also choose any one or two of them, for example ('train', 'dev') or just a string 'train'. @@ -82,7 +82,7 @@ def SQuAD2(root='.data', data_select=('train', 'dev')): >>> print(idx, (context, question, answer, ans_pos)) """ - return _setup_datasets("SQuAD2", root, data_select) + return _setup_datasets("SQuAD2", root, split) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py index c1a67261f1..b3f221a385 100644 --- a/torchtext/experimental/datasets/raw/sequence_tagging.py +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -39,8 +39,8 @@ def _construct_filepath(paths, file_suffix): return None -def _setup_datasets(dataset_name, separator, root, data_select): - data_select = check_default_set(data_select, target_select=('train', 'valid', 'test')) +def _setup_datasets(dataset_name, separator, root, split): + split = check_default_set(split, target_select=('train', 'valid', 'test')) extracted_files = [] if isinstance(URLS[dataset_name], dict): for name, item in URLS[dataset_name].items(): @@ -61,17 +61,17 @@ def _setup_datasets(dataset_name, separator, root, data_select): } return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item], _create_data_from_iob(data_filenames[item], separator)) - if data_filenames[item] is not None else None for item in data_select) + if data_filenames[item] is not None else None for item in split) -def UDPOS(root=".data", data_select=('train', 'valid', 'test')): +def UDPOS(root=".data", split=('train', 'valid', 'test')): """ Universal Dependencies English Web Treebank Separately returns the training and test dataset Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) + split: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'valid', 'test') or just a string 'train'. @@ -80,17 +80,17 @@ def UDPOS(root=".data", data_select=('train', 'valid', 'test')): >>> from torchtext.experimental.datasets.raw import UDPOS >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ - return _setup_datasets("UDPOS", "\t", root, data_select) + return _setup_datasets("UDPOS", "\t", root, split) -def CoNLL2000Chunking(root=".data", data_select=('train', 'test')): +def CoNLL2000Chunking(root=".data", split=('train', 'test')): """ CoNLL 2000 Chunking Dataset Separately returns the training and test dataset Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets (Default: ('train', 'test')) + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
@@ -98,7 +98,7 @@ def CoNLL2000Chunking(root=".data", data_select=('train', 'test')): >>> from torchtext.experimental.datasets.raw import CoNLL2000Chunking >>> train_dataset, test_dataset = CoNLL2000Chunking() """ - return _setup_datasets("CoNLL2000Chunking", " ", root, data_select) + return _setup_datasets("CoNLL2000Chunking", " ", root, split) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/text_classification.py b/torchtext/experimental/datasets/raw/text_classification.py index 694e4d5d29..4a1706714c 100644 --- a/torchtext/experimental/datasets/raw/text_classification.py +++ b/torchtext/experimental/datasets/raw/text_classification.py @@ -33,8 +33,8 @@ def _create_data_from_csv(data_path): yield int(row[0]), ' '.join(row[1:]) -def _setup_datasets(dataset_name, root, data_select): - data_select = check_default_set(data_select, target_select=('train', 'test')) +def _setup_datasets(dataset_name, root, split): + split = check_default_set(split, target_select=('train', 'test')) if dataset_name == 'AG_NEWS': extracted_files = [download_from_url(URLS[dataset_name][item], root=root, hash_value=MD5['AG_NEWS'][item], @@ -51,10 +51,10 @@ def _setup_datasets(dataset_name, root, data_select): if fname.endswith('test.csv'): cvs_path['test'] = fname return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item], - _create_data_from_csv(cvs_path[item])) for item in data_select) + _create_data_from_csv(cvs_path[item])) for item in split) -def AG_NEWS(root='.data', data_select=('train', 'test')): +def AG_NEWS(root='.data', split=('train', 'test')): """ Defines AG_NEWS datasets. Create supervised learning dataset: AG_NEWS @@ -63,7 +63,7 @@ def AG_NEWS(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -71,10 +71,10 @@ def AG_NEWS(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.AG_NEWS() """ - return _setup_datasets("AG_NEWS", root, data_select) + return _setup_datasets("AG_NEWS", root, split) -def SogouNews(root='.data', data_select=('train', 'test')): +def SogouNews(root='.data', split=('train', 'test')): """ Defines SogouNews datasets. Create supervised learning dataset: SogouNews @@ -83,7 +83,7 @@ def SogouNews(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -91,10 +91,10 @@ def SogouNews(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.SogouNews() """ - return _setup_datasets("SogouNews", root, data_select) + return _setup_datasets("SogouNews", root, split) -def DBpedia(root='.data', data_select=('train', 'test')): +def DBpedia(root='.data', split=('train', 'test')): """ Defines DBpedia datasets. 
Create supervised learning dataset: DBpedia @@ -103,7 +103,7 @@ def DBpedia(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -111,10 +111,10 @@ def DBpedia(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.DBpedia() """ - return _setup_datasets("DBpedia", root, data_select) + return _setup_datasets("DBpedia", root, split) -def YelpReviewPolarity(root='.data', data_select=('train', 'test')): +def YelpReviewPolarity(root='.data', split=('train', 'test')): """ Defines YelpReviewPolarity datasets. Create supervised learning dataset: YelpReviewPolarity @@ -123,7 +123,7 @@ def YelpReviewPolarity(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -131,10 +131,10 @@ def YelpReviewPolarity(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.YelpReviewPolarity() """ - return _setup_datasets("YelpReviewPolarity", root, data_select) + return _setup_datasets("YelpReviewPolarity", root, split) -def YelpReviewFull(root='.data', data_select=('train', 'test')): +def YelpReviewFull(root='.data', split=('train', 'test')): """ Defines YelpReviewFull datasets. Create supervised learning dataset: YelpReviewFull @@ -143,7 +143,7 @@ def YelpReviewFull(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -151,10 +151,10 @@ def YelpReviewFull(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.YelpReviewFull() """ - return _setup_datasets("YelpReviewFull", root, data_select) + return _setup_datasets("YelpReviewFull", root, split) -def YahooAnswers(root='.data', data_select=('train', 'test')): +def YahooAnswers(root='.data', split=('train', 'test')): """ Defines YahooAnswers datasets. Create supervised learning dataset: YahooAnswers @@ -163,7 +163,7 @@ def YahooAnswers(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
@@ -171,10 +171,10 @@ def YahooAnswers(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.YahooAnswers() """ - return _setup_datasets("YahooAnswers", root, data_select) + return _setup_datasets("YahooAnswers", root, split) -def AmazonReviewPolarity(root='.data', data_select=('train', 'test')): +def AmazonReviewPolarity(root='.data', split=('train', 'test')): """ Defines AmazonReviewPolarity datasets. Create supervised learning dataset: AmazonReviewPolarity @@ -183,7 +183,7 @@ def AmazonReviewPolarity(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -191,10 +191,10 @@ def AmazonReviewPolarity(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.AmazonReviewPolarity() """ - return _setup_datasets("AmazonReviewPolarity", root, data_select) + return _setup_datasets("AmazonReviewPolarity", root, split) -def AmazonReviewFull(root='.data', data_select=('train', 'test')): +def AmazonReviewFull(root='.data', split=('train', 'test')): """ Defines AmazonReviewFull datasets. Create supervised learning dataset: AmazonReviewFull @@ -203,7 +203,7 @@ def AmazonReviewFull(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. @@ -211,7 +211,7 @@ def AmazonReviewFull(root='.data', data_select=('train', 'test')): >>> train, test = torchtext.experimental.datasets.raw.AmazonReviewFull() """ - return _setup_datasets("AmazonReviewFull", root, data_select) + return _setup_datasets("AmazonReviewFull", root, split) def generate_imdb_data(key, extracted_files): @@ -224,7 +224,7 @@ def generate_imdb_data(key, extracted_files): yield label, f.read() -def IMDB(root='.data', data_select=('train', 'test')): +def IMDB(root='.data', split=('train', 'test')): """ Defines raw IMDB datasets. Create supervised learning dataset: IMDB @@ -233,20 +233,20 @@ def IMDB(root='.data', data_select=('train', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tuple for the returned datasets. Default: ('train', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'test') By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
Examples: >>> train, test = torchtext.experimental.datasets.raw.IMDB() """ - data_select = check_default_set(data_select, target_select=('train', 'test')) + split = check_default_set(split, target_select=('train', 'test')) dataset_tar = download_from_url(URLS['IMDB'], root=root, hash_value=MD5['IMDB'], hash_type='md5') extracted_files = extract_archive(dataset_tar) return tuple(RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item], generate_imdb_data(item, - extracted_files)) for item in data_select) + extracted_files)) for item in split) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/translation.py b/torchtext/experimental/datasets/raw/translation.py index 25960bead8..d4ca29a028 100644 --- a/torchtext/experimental/datasets/raw/translation.py +++ b/torchtext/experimental/datasets/raw/translation.py @@ -116,8 +116,8 @@ def _construct_filepaths(paths, src_filename, tgt_filename): def _setup_datasets(dataset_name, train_filenames, valid_filenames, test_filenames, - data_select, root): - data_select = check_default_set(data_select, ('train', 'valid', 'test')) + split, root): + split = check_default_set(split, ('train', 'valid', 'test')) if not isinstance(train_filenames, tuple) and not isinstance(valid_filenames, tuple) \ and not isinstance(test_filenames, tuple): raise ValueError("All filenames must be tuples") @@ -175,7 +175,7 @@ def _setup_datasets(dataset_name, "Files are not found for data type {}".format(key)) datasets = [] - for key in data_select: + for key in split: src_data_iter = _read_text_iterator(data_filenames[key][0]) tgt_data_iter = _read_text_iterator(data_filenames[key][1]) @@ -192,7 +192,7 @@ def _iter(src_data_iter, tgt_data_iter): def Multi30k(train_filenames=("train.de", "train.en"), valid_filenames=("val.de", "val.en"), test_filenames=("test_2016_flickr.de", "test_2016_flickr.en"), - data_select=('train', 'valid', 'test'), root='.data'): + split=('train', 'valid', 'test'), root='.data'): """ Define translation datasets: Multi30k Separately returns train/valid/test datasets as a tuple The available dataset include: @@ -253,7 +253,7 @@ def Multi30k(train_filenames=("train.de", "train.en"), Default: ('val.de', 'val.en') test_filenames: the source and target filenames for test. Default: ('test2016.de', 'test2016.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
If 'train' is not in the tuple or string, a vocab @@ -264,7 +264,7 @@ def Multi30k(train_filenames=("train.de", "train.en"), >>> from torchtext.experimental.datasets.raw import Multi30k >>> train_dataset, valid_dataset, test_dataset = Multi30k() """ - return _setup_datasets("Multi30k", train_filenames, valid_filenames, test_filenames, data_select, root) + return _setup_datasets("Multi30k", train_filenames, valid_filenames, test_filenames, split, root) def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), @@ -272,7 +272,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), 'IWSLT16.TED.tst2013.de-en.en'), test_filenames=('IWSLT16.TED.tst2014.de-en.de', 'IWSLT16.TED.tst2014.de-en.en'), - data_select=('train', 'valid', 'test'), root='.data'): + split=('train', 'valid', 'test'), root='.data'): """ Define translation datasets: IWSLT Separately returns train/valid/test datasets The available datasets include: @@ -419,7 +419,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), Default: ('IWSLT16.TED.tst2013.de-en.de', 'IWSLT16.TED.tst2013.de-en.en') test_filenames: the source and target filenames for test. Default: ('IWSLT16.TED.tst2014.de-en.de', 'IWSLT16.TED.tst2014.de-en.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -430,7 +430,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), >>> from torchtext.experimental.datasets.raw import IWSLT >>> train_dataset, valid_dataset, test_dataset = IWSLT() """ - return _setup_datasets("IWSLT", train_filenames, valid_filenames, test_filenames, data_select, root) + return _setup_datasets("IWSLT", train_filenames, valid_filenames, test_filenames, split, root) def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', @@ -439,7 +439,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', 'newstest2013.tok.bpe.32000.en'), test_filenames=('newstest2014.tok.bpe.32000.de', 'newstest2014.tok.bpe.32000.en'), - data_select=('train', 'valid', 'test'), root='.data'): + split=('train', 'valid', 'test'), root='.data'): """ Define translation datasets: WMT14 Separately returns train/valid/test datasets The available datasets include: @@ -501,7 +501,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', Default: ('newstest2013.tok.bpe.32000.de', 'newstest2013.tok.bpe.32000.en') test_filenames: the source and target filenames for test. Default: ('newstest2014.tok.bpe.32000.de', 'newstest2014.tok.bpe.32000.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
If 'train' is not in the tuple or string, a vocab @@ -512,7 +512,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', >>> from torchtext.experimental.datasets.raw import WMT14 >>> train_dataset, valid_dataset, test_dataset = WMT14() """ - return _setup_datasets("WMT14", train_filenames, valid_filenames, test_filenames, data_select, root) + return _setup_datasets("WMT14", train_filenames, valid_filenames, test_filenames, split, root) DATASETS = { diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 26b52ab206..2ff9ee873b 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -26,15 +26,15 @@ def build_vocab(data): return vocabs -def _setup_datasets(dataset_name, root, vocabs, data_select): - data_select = check_default_set(data_select, ('train', 'valid', 'test')) - raw_iter_tuple = raw.DATASETS[dataset_name](root=root, data_select=data_select) +def _setup_datasets(dataset_name, root, vocabs, split): + split = check_default_set(split, ('train', 'valid', 'test')) + raw_iter_tuple = raw.DATASETS[dataset_name](root=root, split=split) raw_data = {} - for name, raw_iter in zip(data_select, raw_iter_tuple): + for name, raw_iter in zip(split, raw_iter_tuple): raw_data[name] = list(raw_iter) if vocabs is None: - if "train" not in data_select: + if "train" not in split: raise TypeError("Must pass a vocab if train is not selected.") logger_.info('Building Vocab based on train data') vocabs = build_vocab(raw_data["train"]) @@ -58,8 +58,8 @@ def _setup_datasets(dataset_name, root, vocabs, data_select): totensor(dtype=torch.long)) for idx in range(len(vocabs)) ] - logger_.info('Building datasets for {}'.format(data_select)) - return tuple(SequenceTaggingDataset(raw_data[item], vocabs, transformers) for item in data_select) + logger_.info('Building datasets for {}'.format(split)) + return tuple(SequenceTaggingDataset(raw_data[item], vocabs, transformers) for item in split) class SequenceTaggingDataset(torch.utils.data.Dataset): @@ -108,7 +108,7 @@ def get_vocabs(self): return self.vocabs -def UDPOS(root=".data", vocabs=None, data_select=("train", "valid", "test")): +def UDPOS(root=".data", vocabs=None, split=("train", "valid", "test")): """ Universal Dependencies English Web Treebank Separately returns the training, validation, and test dataset @@ -118,7 +118,7 @@ def UDPOS(root=".data", vocabs=None, data_select=("train", "valid", "test")): vocabs: A list of voabularies for each columns in the dataset. Must be in an instance of List Default: None - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the three datasets (train, test, valid) are generated. 
Users could also choose any one or two of them, for example ('train', 'test') or @@ -131,10 +131,10 @@ def UDPOS(root=".data", vocabs=None, data_select=("train", "valid", "test")): >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ - return _setup_datasets("UDPOS", root, vocabs, data_select) + return _setup_datasets("UDPOS", root, vocabs, split) -def CoNLL2000Chunking(root=".data", vocabs=None, data_select=("train", "test")): +def CoNLL2000Chunking(root=".data", vocabs=None, split=("train", "test")): """ CoNLL 2000 Chunking Dataset Separately returns the training and test dataset @@ -144,7 +144,7 @@ def CoNLL2000Chunking(root=".data", vocabs=None, data_select=("train", "test")): vocabs: A list of voabularies for each columns in the dataset. Must be in an instance of List Default: None - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, both datasets (train, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -157,7 +157,7 @@ def CoNLL2000Chunking(root=".data", vocabs=None, data_select=("train", "test")): >>> train_dataset, test_dataset = CoNLL2000Chunking() """ - return _setup_datasets("CoNLL2000Chunking", root, vocabs, data_select) + return _setup_datasets("CoNLL2000Chunking", root, vocabs, split) DATASETS = {"UDPOS": UDPOS, "CoNLL2000Chunking": CoNLL2000Chunking} diff --git a/torchtext/experimental/datasets/text_classification.py b/torchtext/experimental/datasets/text_classification.py index 0646e8d63a..f968cc518b 100644 --- a/torchtext/experimental/datasets/text_classification.py +++ b/torchtext/experimental/datasets/text_classification.py @@ -69,18 +69,18 @@ def get_vocab(self): return self.vocab -def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select): +def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, split): text_transform = [] if tokenizer is None: tokenizer = get_tokenizer("basic_english") text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams)) - data_select = check_default_set(data_select, ('train', 'test')) - raw_datasets = raw.DATASETS[dataset_name](root=root, data_select=data_select) + split = check_default_set(split, ('train', 'test')) + raw_datasets = raw.DATASETS[dataset_name](root=root, split=split) # Materialize raw text iterable dataset - raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(data_select, raw_datasets)} + raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(split, raw_datasets)} if vocab is None: - if "train" not in data_select: + if "train" not in split: raise TypeError("Must pass a vocab if train is not selected.") logger_.info('Building Vocab based on train data') vocab = build_vocab(raw_data["train"], text_transform) @@ -92,16 +92,16 @@ def _setup_datasets(dataset_name, root, ngrams, vocab, tokenizer, data_select): label_transform = sequential_transforms(lambda x: 1 if x == 'pos' else 0, totensor(dtype=torch.long)) else: label_transform = sequential_transforms(totensor(dtype=torch.long)) - logger_.info('Building datasets for {}'.format(data_select)) + logger_.info('Building datasets for {}'.format(split)) return tuple( TextClassificationDataset( raw_data[item], vocab, (label_transform, text_transform) ) - for item in data_select + for item in split ) -def AG_NEWS(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def AG_NEWS(root='.data', ngrams=1, vocab=None, tokenizer=None, 
split=('train', 'test')): """ Defines AG_NEWS datasets. The labels includes: - 1 : World @@ -123,7 +123,7 @@ def AG_NEWS(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('tr The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -137,14 +137,14 @@ def AG_NEWS(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('tr >>> train, test = AG_NEWS(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = AG_NEWS(tokenizer=tokenizer) - >>> train, = AG_NEWS(tokenizer=tokenizer, data_select='train') + >>> train, = AG_NEWS(tokenizer=tokenizer, split='train') """ - return _setup_datasets("AG_NEWS", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("AG_NEWS", root, ngrams, vocab, tokenizer, split) -def SogouNews(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def SogouNews(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines SogouNews datasets. The labels includes: - 1 : Sports @@ -167,7 +167,7 @@ def SogouNews(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=(' The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -181,14 +181,14 @@ def SogouNews(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=(' >>> train, test = SogouNews(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = SogouNews(tokenizer=tokenizer) - >>> train, = SogouNews(tokenizer=tokenizer, data_select='train') + >>> train, = SogouNews(tokenizer=tokenizer, split='train') """ - return _setup_datasets("SogouNews", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("SogouNews", root, ngrams, vocab, tokenizer, split) -def DBpedia(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def DBpedia(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines DBpedia datasets. The labels includes: - 1 : Company @@ -220,7 +220,7 @@ def DBpedia(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('tr The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. 
Users could also choose any one or two of them, for example ('train', 'test') or @@ -234,14 +234,14 @@ def DBpedia(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('tr >>> train, test = DBpedia(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = DBpedia(tokenizer=tokenizer) - >>> train, = DBpedia(tokenizer=tokenizer, data_select='train') + >>> train, = DBpedia(tokenizer=tokenizer, split='train') """ - return _setup_datasets("DBpedia", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("DBpedia", root, ngrams, vocab, tokenizer, split) -def YelpReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def YelpReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines YelpReviewPolarity datasets. The labels includes: - 1 : Negative polarity. @@ -261,7 +261,7 @@ def YelpReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, data_ The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -275,14 +275,14 @@ def YelpReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, data_ >>> train, test = YelpReviewPolarity(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = YelpReviewPolarity(tokenizer=tokenizer) - >>> train, = YelpReviewPolarity(tokenizer=tokenizer, data_select='train') + >>> train, = YelpReviewPolarity(tokenizer=tokenizer, split='train') """ - return _setup_datasets("YelpReviewPolarity", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("YelpReviewPolarity", root, ngrams, vocab, tokenizer, split) -def YelpReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def YelpReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines YelpReviewFull datasets. The labels includes: 1 - 5 : rating classes (5 is highly recommended). @@ -301,7 +301,7 @@ def YelpReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_sele The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. 
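As a further sketch, the renamed argument composes with the existing ngrams and tokenizer options exactly as the docstring examples suggest; YelpReviewPolarity here is only a stand-in for any constructor in this file:

    from torchtext.data.utils import get_tokenizer
    from torchtext.experimental.datasets.text_classification import YelpReviewPolarity

    # Same pattern as the ">>> train, = ...(tokenizer=tokenizer, split='train')" examples above.
    tokenizer = get_tokenizer("basic_english")
    train, test = YelpReviewPolarity(ngrams=2, tokenizer=tokenizer, split=('train', 'test'))
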
Users could also choose any one or two of them, for example ('train', 'test') or @@ -315,14 +315,14 @@ def YelpReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_sele >>> train, test = YelpReviewFull(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = YelpReviewFull(tokenizer=tokenizer) - >>> train, = YelpReviewFull(tokenizer=tokenizer, data_select='train') + >>> train, = YelpReviewFull(tokenizer=tokenizer, split='train') """ - return _setup_datasets("YelpReviewFull", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("YelpReviewFull", root, ngrams, vocab, tokenizer, split) -def YahooAnswers(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def YahooAnswers(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines YahooAnswers datasets. The labels includes: - 1 : Society & Culture @@ -350,7 +350,7 @@ def YahooAnswers(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -364,14 +364,14 @@ def YahooAnswers(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select >>> train, test = YahooAnswers(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = YahooAnswers(tokenizer=tokenizer) - >>> train, = YahooAnswers(tokenizer=tokenizer, data_select='train') + >>> train, = YahooAnswers(tokenizer=tokenizer, split='train') """ - return _setup_datasets("YahooAnswers", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("YahooAnswers", root, ngrams, vocab, tokenizer, split) -def AmazonReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def AmazonReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines AmazonReviewPolarity datasets. The labels includes: - 1 : Negative polarity @@ -391,7 +391,7 @@ def AmazonReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, dat The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. 
Users could also choose any one or two of them, for example ('train', 'test') or @@ -405,14 +405,14 @@ def AmazonReviewPolarity(root='.data', ngrams=1, vocab=None, tokenizer=None, dat >>> train, test = AmazonReviewPolarity(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = AmazonReviewPolarity(tokenizer=tokenizer) - >>> train, = AmazonReviewPolarity(tokenizer=tokenizer, data_select='train') + >>> train, = AmazonReviewPolarity(tokenizer=tokenizer, split='train') """ - return _setup_datasets("AmazonReviewPolarity", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("AmazonReviewPolarity", root, ngrams, vocab, tokenizer, split) -def AmazonReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def AmazonReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines AmazonReviewFull datasets. The labels includes: 1 - 5 : rating classes (5 is highly recommended) @@ -431,7 +431,7 @@ def AmazonReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_se The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or @@ -445,14 +445,14 @@ def AmazonReviewFull(root='.data', ngrams=1, vocab=None, tokenizer=None, data_se >>> train, test = AmazonReviewFull(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = AmazonReviewFull(tokenizer=tokenizer) - >>> train, = AmazonReviewFull(tokenizer=tokenizer, data_select='train') + >>> train, = AmazonReviewFull(tokenizer=tokenizer, split='train') """ - return _setup_datasets("AmazonReviewFull", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("AmazonReviewFull", root, ngrams, vocab, tokenizer, split) -def IMDB(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train', 'test')): +def IMDB(root='.data', ngrams=1, vocab=None, tokenizer=None, split=('train', 'test')): """ Defines IMDB datasets. The labels includes: - 0 : Negative @@ -473,7 +473,7 @@ def IMDB(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well. A custom tokenizer is callable function with input of a string and output of a token list. - data_select: a string or tuple for the returned datasets + split: a string or tuple for the returned datasets (Default: ('train', 'test')) By default, all the three datasets (train, test, valid) are generated. 
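A sketch of the vocab requirement noted above: when 'train' is not in split, _setup_datasets() raises a TypeError unless a vocab is passed, so one option is to build the vocab from the train split first and reuse it (IMDB here is just an example):

    from torchtext.experimental.datasets.text_classification import IMDB

    # Build the vocab from the train split, then reuse it for the test split.
    train, = IMDB(split='train')
    vocab = train.get_vocab()
    test, = IMDB(split='test', vocab=vocab)
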
Users could also choose any one or two of them, for example ('train', 'test') or @@ -487,11 +487,11 @@ def IMDB(root='.data', ngrams=1, vocab=None, tokenizer=None, data_select=('train >>> train, test = IMDB(ngrams=3) >>> tokenizer = get_tokenizer("spacy") >>> train, test = IMDB(tokenizer=tokenizer) - >>> train, = IMDB(tokenizer=tokenizer, data_select='train') + >>> train, = IMDB(tokenizer=tokenizer, split='train') """ - return _setup_datasets("IMDB", root, ngrams, vocab, tokenizer, data_select) + return _setup_datasets("IMDB", root, ngrams, vocab, tokenizer, split) DATASETS = { diff --git a/torchtext/experimental/datasets/translation.py b/torchtext/experimental/datasets/translation.py index c38d17401d..1429b509ef 100644 --- a/torchtext/experimental/datasets/translation.py +++ b/torchtext/experimental/datasets/translation.py @@ -18,8 +18,8 @@ def apply_transforms(data): def _setup_datasets(dataset_name, train_filenames, valid_filenames, test_filenames, - data_select, root, vocab, tokenizer): - data_select = check_default_set(data_select, ('train', 'valid', 'test')) + split, root, vocab, tokenizer): + split = check_default_set(split, ('train', 'valid', 'test')) src_vocab, tgt_vocab = vocab if tokenizer is None: src_tokenizer = get_tokenizer("spacy", language='de_core_news_sm') @@ -37,13 +37,13 @@ def _setup_datasets(dataset_name, raw_datasets = raw.DATASETS[dataset_name](train_filenames=train_filenames, valid_filenames=valid_filenames, test_filenames=test_filenames, - data_select=data_select, root=root) - raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(data_select, raw_datasets)} + split=split, root=root) + raw_data = {name: list(raw_dataset) for name, raw_dataset in zip(split, raw_datasets)} src_text_vocab_transform = sequential_transforms(src_tokenizer) tgt_text_vocab_transform = sequential_transforms(tgt_tokenizer) if src_vocab is None: - if 'train' not in data_select: + if 'train' not in split: raise TypeError("Must pass a vocab if train is not selected.") logger_.info('Building src Vocab based on train data') src_vocab = build_vocab(raw_data["train"], @@ -55,7 +55,7 @@ def _setup_datasets(dataset_name, logger_.info('src Vocab has %d entries', len(src_vocab)) if tgt_vocab is None: - if 'train' not in data_select: + if 'train' not in split: raise TypeError("Must pass a vocab if train is not selected.") logger_.info('Building tgt Vocab based on train data') tgt_vocab = build_vocab(raw_data["train"], @@ -66,9 +66,9 @@ def _setup_datasets(dataset_name, raise TypeError("Passed tgt vocabulary is not of type Vocab") logger_.info('tgt Vocab has %d entries', len(tgt_vocab)) - logger_.info('Building datasets for {}'.format(data_select)) + logger_.info('Building datasets for {}'.format(split)) datasets = [] - for key in data_select: + for key in split: src_text_transform = sequential_transforms(src_text_vocab_transform, vocab_func(src_vocab), totensor(dtype=torch.long)) @@ -135,7 +135,7 @@ def get_vocab(self): def Multi30k(train_filenames=("train.de", "train.en"), valid_filenames=("val.de", "val.en"), test_filenames=("test_2016_flickr.de", "test_2016_flickr.en"), - data_select=('train', 'valid', 'test'), + split=('train', 'valid', 'test'), root='.data', vocab=(None, None), tokenizer=None): @@ -150,7 +150,7 @@ def Multi30k(train_filenames=("train.de", "train.en"), Default: ('val.de', 'val.en') test_filenames: the source and target filenames for test. 
Default: ('test2016.de', 'test2016.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -228,7 +228,7 @@ def Multi30k(train_filenames=("train.de", "train.en"), """ return _setup_datasets("Multi30k", train_filenames, valid_filenames, test_filenames, - data_select, root, vocab, tokenizer) + split, root, vocab, tokenizer) def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), @@ -236,7 +236,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), 'IWSLT16.TED.tst2013.de-en.en'), test_filenames=('IWSLT16.TED.tst2014.de-en.de', 'IWSLT16.TED.tst2014.de-en.en'), - data_select=('train', 'valid', 'test'), + split=('train', 'valid', 'test'), root='.data', vocab=(None, None), tokenizer=None): @@ -252,7 +252,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), Default: ('IWSLT16.TED.tst2013.de-en.de', 'IWSLT16.TED.tst2013.de-en.en') test_filenames: the source and target filenames for test. Default: ('IWSLT16.TED.tst2014.de-en.de', 'IWSLT16.TED.tst2014.de-en.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -417,7 +417,7 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'), """ return _setup_datasets("IWSLT", train_filenames, valid_filenames, test_filenames, - data_select, root, vocab, tokenizer) + split, root, vocab, tokenizer) def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', @@ -426,7 +426,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', 'newstest2013.tok.bpe.32000.en'), test_filenames=('newstest2014.tok.bpe.32000.de', 'newstest2014.tok.bpe.32000.en'), - data_select=('train', 'valid', 'test'), + split=('train', 'valid', 'test'), root='.data', vocab=(None, None), tokenizer=None): @@ -493,7 +493,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', Default: ('newstest2013.tok.bpe.32000.de', 'newstest2013.tok.bpe.32000.en') test_filenames: the source and target filenames for test. Default: ('newstest2014.tok.bpe.32000.de', 'newstest2014.tok.bpe.32000.en') - data_select: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets, Default: ('train', 'valid', 'test') By default, all the three datasets (train, valid, test) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. 
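For the translation constructors, a minimal sketch of the same renamed argument; Multi30k is used as the example, and since vocab is a (src, tgt) pair, both vocabularies would need to be supplied whenever 'train' is left out of split:

    from torchtext.experimental.datasets.translation import Multi30k

    # All three splits (the default), then just the training split.
    train, valid, test = Multi30k(split=('train', 'valid', 'test'))
    train, = Multi30k(split='train')
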
If 'train' is not in the tuple or string, a vocab @@ -520,7 +520,7 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de', """ return _setup_datasets("WMT14", train_filenames, valid_filenames, test_filenames, - data_select, root, vocab, tokenizer) + split, root, vocab, tokenizer) DATASETS = {'Multi30k': Multi30k, 'IWSLT': IWSLT, 'WMT14': WMT14} From 4bbabda46f51f711482a23a93acbe7720ce41f28 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 8 Feb 2021 15:07:13 -0800 Subject: [PATCH 2/2] fix a typo in doc --- torchtext/experimental/datasets/language_modeling.py | 6 +++--- torchtext/experimental/datasets/raw/language_modeling.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 6e50b27df1..e841314c91 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -102,7 +102,7 @@ def WikiText2(tokenizer=None, root='.data', vocab=None, split=('train', 'valid', root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - split: a string or tupel for the returned datasets. Default: ('train', 'valid','test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid','test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -136,7 +136,7 @@ def WikiText103(tokenizer=None, root='.data', vocab=None, split=('train', 'valid root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - split: a string or tupel for the returned datasets. Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid', 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab @@ -171,7 +171,7 @@ def PennTreebank(tokenizer=None, root='.data', vocab=None, split=('train', 'vali root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - split: a string or tupel for the returned datasets. Default: ('train', 'valid', 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid', 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index b369a060b2..9edb6e735c 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -66,7 +66,7 @@ def WikiText2(root='.data', split=('train', 'valid', 'test')): Args: root: Directory where the datasets are saved. Default: ".data" - split: a string or tupel for the returned datasets. 
Default: ('train', 'valid, 'test') + split: a string or tuple for the returned datasets. Default: ('train', 'valid', 'test') By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test') or just a string 'train'. If 'train' is not in the tuple or string, a vocab