From e4b094cb8c9dfdfd4aa8a2b24fcaa096bb816883 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Sun, 10 May 2020 22:46:55 -0400 Subject: [PATCH 01/25] torchtext.experimental.raw: update __init__.py --- torchtext/experimental/datasets/raw/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/raw/__init__.py b/torchtext/experimental/datasets/raw/__init__.py index 61accbe2a1..5c1eb4202d 100644 --- a/torchtext/experimental/datasets/raw/__init__.py +++ b/torchtext/experimental/datasets/raw/__init__.py @@ -1,6 +1,7 @@ from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ YelpReviewFull, YahooAnswers, \ AmazonReviewPolarity, AmazonReviewFull, IMDB +from .language_modeling import WikiText2, WikiText103, PennTreebank __all__ = ['IMDB', 'AG_NEWS', @@ -10,4 +11,7 @@ 'YelpReviewFull', 'YahooAnswers', 'AmazonReviewPolarity', - 'AmazonReviewFull'] + 'AmazonReviewFull', + 'WikiText2', + 'WikiText103', + 'PennTreebank'] From eb5409c83d0e6059c4528bbaebf62cc4885a73ad Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 12 May 2020 22:17:48 -0400 Subject: [PATCH 02/25] add language_modeling.py in raw dataset --- .../datasets/raw/language_modeling.py | 162 ++++++++++++++++++ 1 file changed, 162 insertions(+) create mode 100644 torchtext/experimental/datasets/raw/language_modeling.py diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py new file mode 100644 index 0000000000..d16649dd1b --- /dev/null +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -0,0 +1,162 @@ +import sys +import torch +import logging +import io +from torchtext.utils import download_from_url, extract_archive + +URLS = { + 'WikiText2': + 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', + 'WikiText103': + 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', + 'PennTreebank': + ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'] +} + + +class RawTextIterableDataset(torch.utils.data.IterableDataset): + """Defines an abstraction for raw text iterable datasets. + """ + + def __init__(self, iterator): + """Initiate language modeling dataset. 
+ """ + super(RawTextIterableDataset, self).__init__() + self._iterator = iterator + self.has_setup = False + self.start = 0 + self.num_lines = None + + def setup_iter(self, start=0, num_lines=None): + self.start = start + self.num_lines = num_lines + self.has_setup = True + + def __iter__(self): + if not self.has_setup: + self.setup_iter() + + for i, item in enumerate(self._iterator): + if i == self.start: + break + + num_lines = self.num_lines if self.num_lines is not None else sys.maxsize + for _ in range(num_lines): + yield item + try: + item = next(self._iterator) + except StopIteration: + break + + def get_iterator(self): + return self._iterator + + +def _get_datafile_path(key, extracted_files): + for fname in extracted_files: + if key in fname: + return fname + + +def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'valid')): + if isinstance(data_select, str): + data_select = [data_select] + if not set(data_select).issubset(set(('train', 'test', 'valid'))): + raise TypeError('data_select is not supported!') + + if dataset_name == 'PennTreebank': + extracted_files = [] + select_to_index = {'train': 0, 'test': 1, 'valid': 2} + extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], + root=root) for key in data_select] + else: + dataset_tar = download_from_url(URLS[dataset_name], root=root) + extracted_files = extract_archive(dataset_tar) + + _path = {} + for item in data_select: + _path[item] = _get_datafile_path(item, extracted_files) + + data = {} + for item in _path.keys(): + logging.info('Creating {} data'.format(item)) + data[item] = iter(io.open(_path[item], encoding="utf8")) + + return tuple(RawTextIterableDataset(data[item]) for item in data_select) + + +def WikiText2(*args, **kwargs): + """ Defines WikiText2 datasets. + + Create language modeling dataset: WikiText2 + Separately returns the train/test/valid set + + Arguments: + root: Directory where the datasets are saved. Default: ".data" + data_select: a string or tupel for the returned datasets + (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. + + Examples: + >>> from torchtext.experimental.raw.datasets import WikiText2 + >>> train_dataset, test_dataset, valid_dataset = WikiText2() + >>> valid_dataset, = WikiText2(data_select='valid') + + """ + + return _setup_datasets(*(("WikiText2",) + args), **kwargs) + + +def WikiText103(*args, **kwargs): + """ Defines WikiText103 datasets. + + Create language modeling dataset: WikiText103 + Separately returns the train/test/valid set + + Arguments: + root: Directory where the datasets are saved. Default: ".data" + data_select: the returned datasets (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test'). + If 'train' is not in the tuple, an vocab object should be provided which will + be used to process valid and/or test data. 
+ + Examples: + >>> from torchtext.experimental.datasets.raw import WikiText103 + >>> train_dataset, test_dataset, valid_dataset = WikiText103() + >>> valid_dataset, = WikiText103(data_select='valid') + """ + + return _setup_datasets(*(("WikiText103",) + args), **kwargs) + + +def PennTreebank(*args, **kwargs): + """ Defines PennTreebank datasets. + + Create language modeling dataset: PennTreebank + Separately returns the train/test/valid set + + Arguments: + root: Directory where the datasets are saved. Default: ".data" + data_select: a string or tupel for the returned datasets + (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. + + Examples: + >>> from torchtext.experimental.datasets.raw import PennTreebank + >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) + >>> valid_dataset, = PennTreebank(data_select='valid') + + """ + + return _setup_datasets(*(("PennTreebank",) + args), **kwargs) From ebfd0a7c53846c907d0f4b75d6c4b4722437a168 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 13 May 2020 22:38:21 -0400 Subject: [PATCH 03/25] fix typo --- torchtext/experimental/datasets/text_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtext/experimental/datasets/text_classification.py b/torchtext/experimental/datasets/text_classification.py index fd6e2ad8f3..b47b13d6f5 100644 --- a/torchtext/experimental/datasets/text_classification.py +++ b/torchtext/experimental/datasets/text_classification.py @@ -32,7 +32,7 @@ def build_vocab(data, transforms): return build_vocab_from_iterator(tok_list) -def squential_transforms(*transforms): +def sequential_transforms(*transforms): def _forward(txt_input): for transform in transforms: txt_input = transform(txt_input) @@ -92,7 +92,7 @@ def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, text_transform = [] if tokenizer is None: tokenizer = get_tokenizer('basic_english') - text_transform = squential_transforms(tokenizer, ngrams_func(ngrams)) + text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams)) if isinstance(data_select, str): data_select = [data_select] @@ -107,9 +107,9 @@ def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, if 'train' not in data_select: raise TypeError("Must pass a vocab if train is not selected.") vocab = build_vocab(raw_data['train'], text_transform) - text_transform = squential_transforms(text_transform, vocab_func(vocab), + text_transform = sequential_transforms(text_transform, vocab_func(vocab), totensor(dtype=torch.long)) - label_transform = squential_transforms(totensor(dtype=torch.long)) + label_transform = sequential_transforms(totensor(dtype=torch.long)) return tuple(TextClassificationDataset(raw_data[item], vocab, (label_transform, text_transform)) for item in data_select) From 5dcf6e21e0cb9e10233fdf78a9c9f25591834bec Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 13 May 2020 23:58:50 -0400 Subject: [PATCH 04/25] add new language_modeling dataset --- .../datasets/language_modeling.py | 163 ++++++++---------- 1 file changed, 71 insertions(+), 92 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 
6c0ec5799e..5af3885880 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -1,22 +1,38 @@ import torch -import logging -import io -from torchtext.utils import download_from_url, extract_archive -from torchtext.vocab import build_vocab_from_iterator from torchtext.data.utils import get_tokenizer -from torchtext.vocab import Vocab +from torchtext.vocab import build_vocab_from_iterator from torchtext.data.functional import numericalize_tokens_from_iterator +from torchtext.experimental.datasets import raw -URLS = { - 'WikiText2': - 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', - 'WikiText103': - 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', - 'PennTreebank': - ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'] -} + +def vocab_func(vocab): + def _forward(tok_iter): + return [vocab[tok] for tok in tok_iter] + + return _forward + + +def totensor(dtype): + def _forward(ids_list): + return torch.tensor(ids_list).to(dtype) + + return _forward + + +def build_vocab(data, transforms): + tok_list = [] + for txt in data: + tok_list.append(transforms(txt)) + return build_vocab_from_iterator(tok_list) + + +def sequential_transforms(*transforms): + def _forward(txt_input): + for transform in transforms: + txt_input = transform(txt_input) + return txt_input + + return _forward class LanguageModelingDataset(torch.utils.data.Dataset): @@ -29,7 +45,7 @@ class LanguageModelingDataset(torch.utils.data.Dataset): """ - def __init__(self, data, vocab): + def __init__(self, data, vocab, transforms): """Initiate language modeling dataset. Arguments: @@ -37,22 +53,18 @@ def __init__(self, data, vocab): numericalizing the string tokens. torch.tensor([token_id_1, token_id_2, token_id_3, token_id1]).long() vocab: Vocabulary object used for dataset. - - Examples: - >>> from torchtext.vocab import build_vocab_from_iterator - >>> data = torch.tensor([token_id_1, token_id_2, - token_id_3, token_id_1]).long() - >>> vocab = build_vocab_from_iterator([['language', 'modeling']]) - >>> dataset = LanguageModelingDataset(data, vocab) + transforms: Text string transforms. 
""" super(LanguageModelingDataset, self).__init__() self.data = data self.vocab = vocab + self.transforms = transforms def __getitem__(self, i): - return self.data[i] + txt = self.data[i] + return self.transforms(txt) def __len__(self): return len(self.data) @@ -65,63 +77,31 @@ def get_vocab(self): return self.vocab -def _get_datafile_path(key, extracted_files): - for fname in extracted_files: - if key in fname: - return fname - - -def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', vocab=None, removed_tokens=[], - data_select=('train', 'test', 'valid')): +def _setup_datasets(dataset_name, root='.data', vocab=None, + tokenizer=None, data_select=('train', 'test', 'valid')): + if tokenizer is None: + tokenizer = get_tokenizer('basic_english') + text_transform = sequential_transforms(tokenizer) if isinstance(data_select, str): data_select = [data_select] - if not set(data_select).issubset(set(('train', 'test', 'valid'))): - raise TypeError('data_select is not supported!') - - if dataset_name == 'PennTreebank': - extracted_files = [] - select_to_index = {'train': 0, 'test': 1, 'valid': 2} - extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], - root=root) for key in data_select] - else: - dataset_tar = download_from_url(URLS[dataset_name], root=root) - extracted_files = extract_archive(dataset_tar) - - _path = {} - for item in data_select: - _path[item] = _get_datafile_path(item, extracted_files) + if not set(data_select).issubset(set(('train', 'valid', 'test'))): + raise TypeError('Given data selection {} is not supported!'.format(data_select)) + train, valid, test = DATASETS[dataset_name](root=root) + + # Cache raw text iterable dataset + raw_data = {'train': [txt for txt in train], + 'valid': [txt for txt in valid], + 'test': [txt for txt in test]} if vocab is None: - if 'train' not in _path.keys(): + if 'train' not in data_select: raise TypeError("Must pass a vocab if train is not selected.") - logging.info('Building Vocab based on {}'.format(_path['train'])) - txt_iter = iter(tokenizer(row) for row in io.open(_path['train'], - encoding="utf8")) - vocab = build_vocab_from_iterator(txt_iter) - logging.info('Vocab has {} entries'.format(len(vocab))) - else: - if not isinstance(vocab, Vocab): - raise TypeError("Passed vocabulary is not of type Vocab") - - data = {} - for item in _path.keys(): - data[item] = [] - logging.info('Creating {} data'.format(item)) - txt_iter = iter(tokenizer(row) for row in io.open(_path[item], - encoding="utf8")) - _iter = numericalize_tokens_from_iterator( - vocab, txt_iter, removed_tokens) - for tokens in _iter: - data[item] += [token_id for token_id in tokens] - - for key in data_select: - if data[key] == []: - raise TypeError('Dataset {} is empty!'.format(key)) - - return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab) - for d in data_select) + vocab = build_vocab(raw_data['train'], text_transform) + text_transform = sequential_transforms(text_transform, vocab_func(vocab), + totensor(dtype=torch.long)) + return tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform) + for item in data_select) def WikiText2(*args, **kwargs): @@ -131,14 +111,13 @@ def WikiText2(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. 
The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -168,19 +147,13 @@ def WikiText103(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - data_select: the returned datasets (Default: ('train', 'test','valid')) - By default, all the three datasets (train, test, valid) are generated. Users - could also choose any one or two of them, for example ('train', 'test'). - If 'train' is not in the tuple, an vocab object should be provided which will - be used to process valid and/or test data. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -210,14 +183,13 @@ def PennTreebank(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. 
Users @@ -238,3 +210,10 @@ def PennTreebank(*args, **kwargs): """ return _setup_datasets(*(("PennTreebank",) + args), **kwargs) + + +DATASETS = { + 'WikiText2': raw.WikiText2, + 'WikiText103': raw.WikiText103, + 'PennTreebank': raw.PennTreebank +} From 7be2bfe0efbc0a23689233947f5e4ee330d0e06a Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 13 May 2020 23:58:50 -0400 Subject: [PATCH 05/25] add new language_modeling dataset --- .../datasets/language_modeling.py | 162 ++++++++---------- 1 file changed, 70 insertions(+), 92 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 6c0ec5799e..da4eddb30e 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -1,22 +1,38 @@ import torch -import logging -import io -from torchtext.utils import download_from_url, extract_archive -from torchtext.vocab import build_vocab_from_iterator from torchtext.data.utils import get_tokenizer -from torchtext.vocab import Vocab +from torchtext.vocab import build_vocab_from_iterator from torchtext.data.functional import numericalize_tokens_from_iterator +from torchtext.experimental.datasets import raw -URLS = { - 'WikiText2': - 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', - 'WikiText103': - 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', - 'PennTreebank': - ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'] -} + +def vocab_func(vocab): + def _forward(tok_iter): + return [vocab[tok] for tok in tok_iter] + + return _forward + + +def totensor(dtype): + def _forward(ids_list): + return torch.tensor(ids_list).to(dtype) + + return _forward + + +def build_vocab(data, transforms): + tok_list = [] + for txt in data: + tok_list.append(transforms(txt)) + return build_vocab_from_iterator(tok_list) + + +def sequential_transforms(*transforms): + def _forward(txt_input): + for transform in transforms: + txt_input = transform(txt_input) + return txt_input + + return _forward class LanguageModelingDataset(torch.utils.data.Dataset): @@ -29,7 +45,7 @@ class LanguageModelingDataset(torch.utils.data.Dataset): """ - def __init__(self, data, vocab): + def __init__(self, data, vocab, transforms): """Initiate language modeling dataset. Arguments: @@ -37,19 +53,14 @@ def __init__(self, data, vocab): numericalizing the string tokens. torch.tensor([token_id_1, token_id_2, token_id_3, token_id1]).long() vocab: Vocabulary object used for dataset. - - Examples: - >>> from torchtext.vocab import build_vocab_from_iterator - >>> data = torch.tensor([token_id_1, token_id_2, - token_id_3, token_id_1]).long() - >>> vocab = build_vocab_from_iterator([['language', 'modeling']]) - >>> dataset = LanguageModelingDataset(data, vocab) + transforms: Text string transforms. 
""" super(LanguageModelingDataset, self).__init__() - self.data = data self.vocab = vocab + self.transforms = transforms + self.data = torch.cat(tuple(transforms(row) for row in data), axis=0) def __getitem__(self, i): return self.data[i] @@ -65,63 +76,31 @@ def get_vocab(self): return self.vocab -def _get_datafile_path(key, extracted_files): - for fname in extracted_files: - if key in fname: - return fname - - -def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', vocab=None, removed_tokens=[], - data_select=('train', 'test', 'valid')): +def _setup_datasets(dataset_name, root='.data', vocab=None, + tokenizer=None, data_select=('train', 'test', 'valid')): + if tokenizer is None: + tokenizer = get_tokenizer('basic_english') + text_transform = sequential_transforms(tokenizer) if isinstance(data_select, str): data_select = [data_select] - if not set(data_select).issubset(set(('train', 'test', 'valid'))): - raise TypeError('data_select is not supported!') - - if dataset_name == 'PennTreebank': - extracted_files = [] - select_to_index = {'train': 0, 'test': 1, 'valid': 2} - extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], - root=root) for key in data_select] - else: - dataset_tar = download_from_url(URLS[dataset_name], root=root) - extracted_files = extract_archive(dataset_tar) - - _path = {} - for item in data_select: - _path[item] = _get_datafile_path(item, extracted_files) + if not set(data_select).issubset(set(('train', 'valid', 'test'))): + raise TypeError('Given data selection {} is not supported!'.format(data_select)) + train, valid, test = DATASETS[dataset_name](root=root) + + # Cache raw text iterable dataset + raw_data = {'train': [txt for txt in train], + 'valid': [txt for txt in valid], + 'test': [txt for txt in test]} if vocab is None: - if 'train' not in _path.keys(): + if 'train' not in data_select: raise TypeError("Must pass a vocab if train is not selected.") - logging.info('Building Vocab based on {}'.format(_path['train'])) - txt_iter = iter(tokenizer(row) for row in io.open(_path['train'], - encoding="utf8")) - vocab = build_vocab_from_iterator(txt_iter) - logging.info('Vocab has {} entries'.format(len(vocab))) - else: - if not isinstance(vocab, Vocab): - raise TypeError("Passed vocabulary is not of type Vocab") - - data = {} - for item in _path.keys(): - data[item] = [] - logging.info('Creating {} data'.format(item)) - txt_iter = iter(tokenizer(row) for row in io.open(_path[item], - encoding="utf8")) - _iter = numericalize_tokens_from_iterator( - vocab, txt_iter, removed_tokens) - for tokens in _iter: - data[item] += [token_id for token_id in tokens] - - for key in data_select: - if data[key] == []: - raise TypeError('Dataset {} is empty!'.format(key)) - - return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab) - for d in data_select) + vocab = build_vocab(raw_data['train'], text_transform) + text_transform = sequential_transforms(text_transform, vocab_func(vocab), + totensor(dtype=torch.long)) + return tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform) + for item in data_select) def WikiText2(*args, **kwargs): @@ -131,14 +110,13 @@ def WikiText2(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. 
The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -168,19 +146,13 @@ def WikiText103(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - data_select: the returned datasets (Default: ('train', 'test','valid')) - By default, all the three datasets (train, test, valid) are generated. Users - could also choose any one or two of them, for example ('train', 'test'). - If 'train' is not in the tuple, an vocab object should be provided which will - be used to process valid and/or test data. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -210,14 +182,13 @@ def PennTreebank(*args, **kwargs): Separately returns the train/test/valid set Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: []) data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -238,3 +209,10 @@ def PennTreebank(*args, **kwargs): """ return _setup_datasets(*(("PennTreebank",) + args), **kwargs) + + +DATASETS = { + 'WikiText2': raw.WikiText2, + 'WikiText103': raw.WikiText103, + 'PennTreebank': raw.PennTreebank +} From 71baaf1ff82cb2fce93a63089e891e54d873bed6 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Thu, 14 May 2020 21:28:50 -0400 Subject: [PATCH 06/25] Revert "fix typo". 
Will submit another dedicated PR for typos This reverts commit ebfd0a7c --- torchtext/experimental/datasets/text_classification.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtext/experimental/datasets/text_classification.py b/torchtext/experimental/datasets/text_classification.py index b47b13d6f5..fd6e2ad8f3 100644 --- a/torchtext/experimental/datasets/text_classification.py +++ b/torchtext/experimental/datasets/text_classification.py @@ -32,7 +32,7 @@ def build_vocab(data, transforms): return build_vocab_from_iterator(tok_list) -def sequential_transforms(*transforms): +def squential_transforms(*transforms): def _forward(txt_input): for transform in transforms: txt_input = transform(txt_input) @@ -92,7 +92,7 @@ def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, text_transform = [] if tokenizer is None: tokenizer = get_tokenizer('basic_english') - text_transform = sequential_transforms(tokenizer, ngrams_func(ngrams)) + text_transform = squential_transforms(tokenizer, ngrams_func(ngrams)) if isinstance(data_select, str): data_select = [data_select] @@ -107,9 +107,9 @@ def _setup_datasets(dataset_name, root='.data', ngrams=1, vocab=None, if 'train' not in data_select: raise TypeError("Must pass a vocab if train is not selected.") vocab = build_vocab(raw_data['train'], text_transform) - text_transform = sequential_transforms(text_transform, vocab_func(vocab), + text_transform = squential_transforms(text_transform, vocab_func(vocab), totensor(dtype=torch.long)) - label_transform = sequential_transforms(totensor(dtype=torch.long)) + label_transform = squential_transforms(totensor(dtype=torch.long)) return tuple(TextClassificationDataset(raw_data[item], vocab, (label_transform, text_transform)) for item in data_select) From 77041052c51bc771ba5e7ee0f01bcd9aa8ca25b5 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Thu, 14 May 2020 21:39:10 -0400 Subject: [PATCH 07/25] remove duplicated functions. 
--- .../experimental/datasets/language_modeling.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index da4eddb30e..cd315e753e 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -1,22 +1,8 @@ import torch from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator -from torchtext.data.functional import numericalize_tokens_from_iterator from torchtext.experimental.datasets import raw - - -def vocab_func(vocab): - def _forward(tok_iter): - return [vocab[tok] for tok in tok_iter] - - return _forward - - -def totensor(dtype): - def _forward(ids_list): - return torch.tensor(ids_list).to(dtype) - - return _forward +from .text_classification import vocab_func, totensor def build_vocab(data, transforms): From b9e4645c1de110c8fb5e1ca624e8b8e560d1a491 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Thu, 14 May 2020 22:01:51 -0400 Subject: [PATCH 08/25] fix incorrect dataset orders --- torchtext/experimental/datasets/language_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index cd315e753e..47c27f01d2 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -72,7 +72,7 @@ def _setup_datasets(dataset_name, root='.data', vocab=None, data_select = [data_select] if not set(data_select).issubset(set(('train', 'valid', 'test'))): raise TypeError('Given data selection {} is not supported!'.format(data_select)) - train, valid, test = DATASETS[dataset_name](root=root) + train, test, valid = DATASETS[dataset_name](root=root) # Cache raw text iterable dataset raw_data = {'train': [txt for txt in train], From 37514b5cf5128f26c78cca535bfa01b18b078a54 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Fri, 15 May 2020 12:47:54 -0400 Subject: [PATCH 09/25] remove setup_iter --- .../datasets/raw/language_modeling.py | 23 ++++--------------- 1 file changed, 4 insertions(+), 19 deletions(-) diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index d16649dd1b..397d112ed1 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -20,34 +20,19 @@ class RawTextIterableDataset(torch.utils.data.IterableDataset): """Defines an abstraction for raw text iterable datasets. """ - def __init__(self, iterator): + def __init__(self, iterator, start=0, num_lines=None): """Initiate language modeling dataset. 
""" super(RawTextIterableDataset, self).__init__() self._iterator = iterator - self.has_setup = False - self.start = 0 - self.num_lines = None - - def setup_iter(self, start=0, num_lines=None): self.start = start self.num_lines = num_lines - self.has_setup = True def __iter__(self): - if not self.has_setup: - self.setup_iter() - for i, item in enumerate(self._iterator): - if i == self.start: - break - - num_lines = self.num_lines if self.num_lines is not None else sys.maxsize - for _ in range(num_lines): - yield item - try: - item = next(self._iterator) - except StopIteration: + if i >= self.start: + yield item + if (self.num_lines is not None) and (i == (self.start + self.num_lines)): break def get_iterator(self): From 011704167ed2da14bd571c883aac95f9db98ef82 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Fri, 15 May 2020 12:48:16 -0400 Subject: [PATCH 10/25] explicitly select data --- torchtext/experimental/datasets/language_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 47c27f01d2..ad57ae69e1 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -72,7 +72,7 @@ def _setup_datasets(dataset_name, root='.data', vocab=None, data_select = [data_select] if not set(data_select).issubset(set(('train', 'valid', 'test'))): raise TypeError('Given data selection {} is not supported!'.format(data_select)) - train, test, valid = DATASETS[dataset_name](root=root) + train, test, valid = DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) # Cache raw text iterable dataset raw_data = {'train': [txt for txt in train], From ecafa7b3bee5b25d90c3968ab89d4599df5f7fdd Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Fri, 15 May 2020 13:01:11 -0400 Subject: [PATCH 11/25] remove sys --- torchtext/experimental/datasets/raw/language_modeling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 397d112ed1..0b1e1cda57 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -1,4 +1,3 @@ -import sys import torch import logging import io From 777c8f50867b57ae68b63866f23e2589edae8e86 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 19 May 2020 22:17:43 -0400 Subject: [PATCH 12/25] use functionals --- torchtext/experimental/datasets/language_modeling.py | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index ad57ae69e1..c733084c8d 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -2,7 +2,7 @@ from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator from torchtext.experimental.datasets import raw -from .text_classification import vocab_func, totensor +from torchtext.experimental.functional import vocab_func, totensor, sequential_transforms def build_vocab(data, transforms): @@ -12,15 +12,6 @@ def build_vocab(data, transforms): return build_vocab_from_iterator(tok_list) -def sequential_transforms(*transforms): - def _forward(txt_input): - for transform in transforms: - txt_input = transform(txt_input) - return txt_input - - 
return _forward - - class LanguageModelingDataset(torch.utils.data.Dataset): """Defines a dataset for language modeling. Currently, we only support the following datasets: From f433b409638c2e21e63ca70ca8adaabf05dcdf19 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 19 May 2020 22:59:11 -0400 Subject: [PATCH 13/25] restore the order of vocab/tokenizer --- torchtext/experimental/datasets/language_modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index c733084c8d..8c6849d033 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -53,8 +53,8 @@ def get_vocab(self): return self.vocab -def _setup_datasets(dataset_name, root='.data', vocab=None, - tokenizer=None, data_select=('train', 'test', 'valid')): +def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, + data_select=('train', 'test', 'valid')): if tokenizer is None: tokenizer = get_tokenizer('basic_english') text_transform = sequential_transforms(tokenizer) From 645a749232668e5c17823c37413d3e2d4b997738 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 19 May 2020 23:15:48 -0400 Subject: [PATCH 14/25] Point language_modeling.DATASETS to local functions --- torchtext/experimental/datasets/language_modeling.py | 10 +++++----- .../experimental/datasets/raw/language_modeling.py | 7 +++++++ 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 8c6849d033..426cb444b0 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -1,7 +1,7 @@ import torch from torchtext.data.utils import get_tokenizer from torchtext.vocab import build_vocab_from_iterator -from torchtext.experimental.datasets import raw +from torchtext.experimental.datasets.raw import language_modeling as raw from torchtext.experimental.functional import vocab_func, totensor, sequential_transforms @@ -63,7 +63,7 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, data_select = [data_select] if not set(data_select).issubset(set(('train', 'valid', 'test'))): raise TypeError('Given data selection {} is not supported!'.format(data_select)) - train, test, valid = DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) + train, test, valid = raw.DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) # Cache raw text iterable dataset raw_data = {'train': [txt for txt in train], @@ -189,7 +189,7 @@ def PennTreebank(*args, **kwargs): DATASETS = { - 'WikiText2': raw.WikiText2, - 'WikiText103': raw.WikiText103, - 'PennTreebank': raw.PennTreebank + 'WikiText2': WikiText2, + 'WikiText103': WikiText103, + 'PennTreebank': PennTreebank } diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 0b1e1cda57..f609d64295 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -144,3 +144,10 @@ def PennTreebank(*args, **kwargs): """ return _setup_datasets(*(("PennTreebank",) + args), **kwargs) + + +DATASETS = { + 'WikiText2': WikiText2, + 'WikiText103': WikiText103, + 'PennTreebank': PennTreebank +} From e27058f52b98bac6e545a69076053c284bfae37c Mon Sep 17 
00:00:00 2001 From: Runmin Zhang Date: Wed, 20 May 2020 22:29:07 -0400 Subject: [PATCH 15/25] get rid of _get_datafile_path --- torchtext/experimental/datasets/raw/language_modeling.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index f609d64295..20c01bf263 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -38,12 +38,6 @@ def get_iterator(self): return self._iterator -def _get_datafile_path(key, extracted_files): - for fname in extracted_files: - if key in fname: - return fname - - def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'valid')): if isinstance(data_select, str): data_select = [data_select] @@ -61,7 +55,7 @@ def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'v _path = {} for item in data_select: - _path[item] = _get_datafile_path(item, extracted_files) + _path[item] = [f_name for f_name in extracted_files if item in f_name] data = {} for item in _path.keys(): From f77c53c7c41a548be58e8fed9569ed5aedb17ce9 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 20 May 2020 22:41:46 -0400 Subject: [PATCH 16/25] really get rid of _get_datafile_path. --- .../experimental/datasets/raw/language_modeling.py | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 20c01bf263..9daeb02ff9 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -11,7 +11,8 @@ 'PennTreebank': ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'] + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'], + 'WMTNewsCrawl': 'http://www.statmt.org/wmt11/training-monolingual-news-2010.tgz' } @@ -55,7 +56,9 @@ def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'v _path = {} for item in data_select: - _path[item] = [f_name for f_name in extracted_files if item in f_name] + for fname in extracted_files: + if item in fname: + _path[item] = fname data = {} for item in _path.keys(): @@ -140,6 +143,12 @@ def PennTreebank(*args, **kwargs): return _setup_datasets(*(("PennTreebank",) + args), **kwargs) +def WMTNewsCrawl(*args, **kwargs): + """ Defines WMT News Crawl. 
+ """ + return _setup_datasets(*(("WMTNewsCrawl",) + args), **kwargs) + + DATASETS = { 'WikiText2': WikiText2, 'WikiText103': WikiText103, From fb36e7b5295c7f7d673e2ed44e6b03df65f7e9be Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Sat, 23 May 2020 21:24:31 -0400 Subject: [PATCH 17/25] add WMTNewsCrawl --- torchtext/experimental/datasets/__init__.py | 3 +- .../datasets/language_modeling.py | 46 ++++++++++++++++--- .../experimental/datasets/raw/__init__.py | 5 +- .../datasets/raw/language_modeling.py | 28 +++++++++-- 4 files changed, 69 insertions(+), 13 deletions(-) diff --git a/torchtext/experimental/datasets/__init__.py b/torchtext/experimental/datasets/__init__.py index ac2faa423b..d91f60ff55 100644 --- a/torchtext/experimental/datasets/__init__.py +++ b/torchtext/experimental/datasets/__init__.py @@ -1,4 +1,4 @@ -from .language_modeling import LanguageModelingDataset, WikiText2, WikiText103, PennTreebank # NOQA +from .language_modeling import LanguageModelingDataset, WikiText2, WikiText103, PennTreebank, WMTNewsCrawl # NOQA from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ YelpReviewFull, YahooAnswers, \ AmazonReviewPolarity, AmazonReviewFull, IMDB @@ -7,6 +7,7 @@ 'WikiText2', 'WikiText103', 'PennTreebank', + 'WMTNewsCrawl', 'IMDB', 'AG_NEWS', 'SogouNews', diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 426cb444b0..28eed7aca3 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -19,6 +19,7 @@ class LanguageModelingDataset(torch.utils.data.Dataset): - WikiText2 - WikiText103 - PennTreebank + - WMTNewsCrawl """ @@ -63,12 +64,16 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, data_select = [data_select] if not set(data_select).issubset(set(('train', 'valid', 'test'))): raise TypeError('Given data selection {} is not supported!'.format(data_select)) - train, test, valid = raw.DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) - # Cache raw text iterable dataset - raw_data = {'train': [txt for txt in train], - 'valid': [txt for txt in valid], - 'test': [txt for txt in test]} + if dataset_name == 'WMTNewsCrawl': + train, = raw.DATASETS[dataset_name](root=root, data_select=('train',)) + raw_data = {'train': [txt for txt in train]} + else: + train, test, valid = raw.DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) + # Cache raw text iterable dataset + raw_data = {'train': [txt for txt in train], + 'valid': [txt for txt in valid], + 'test': [txt for txt in test]} if vocab is None: if 'train' not in data_select: @@ -188,8 +193,37 @@ def PennTreebank(*args, **kwargs): return _setup_datasets(*(("PennTreebank",) + args), **kwargs) +def WMTNewsCrawl(*args, **kwargs): + """ Defines WMTNewsCrawl datasets. + + Create language modeling dataset: WMTNewsCrawl + returns the train set + + Arguments: + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. + tokenizer: the tokenizer used to preprocess raw text data. + The default one is basic_english tokenizer in fastText. spacy tokenizer + is supported as well (see example below). A custom tokenizer is callable + function with input of a string and output of a token list. 
+ data_select: a string or tupel for the returned datasets + (Default: ('train',)) + + Examples: + >>> from torchtext.experimental.datasets import WMTNewsCrawl + >>> from torchtext.data.utils import get_tokenizer + >>> tokenizer = get_tokenizer("spacy") + >>> train_dataset, = WMTNewsCrawl(tokenizer=tokenizer, data_select='train') + + """ + + return _setup_datasets(*(("WMTNewsCrawl",) + args), **kwargs) + + DATASETS = { 'WikiText2': WikiText2, 'WikiText103': WikiText103, - 'PennTreebank': PennTreebank + 'PennTreebank': PennTreebank, + 'WMTNewsCrawl': WMTNewsCrawl } diff --git a/torchtext/experimental/datasets/raw/__init__.py b/torchtext/experimental/datasets/raw/__init__.py index 5c1eb4202d..9c988ec05c 100644 --- a/torchtext/experimental/datasets/raw/__init__.py +++ b/torchtext/experimental/datasets/raw/__init__.py @@ -1,7 +1,7 @@ from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ YelpReviewFull, YahooAnswers, \ AmazonReviewPolarity, AmazonReviewFull, IMDB -from .language_modeling import WikiText2, WikiText103, PennTreebank +from .language_modeling import WikiText2, WikiText103, PennTreebank, WMTNewsCrawl __all__ = ['IMDB', 'AG_NEWS', @@ -14,4 +14,5 @@ 'AmazonReviewFull', 'WikiText2', 'WikiText103', - 'PennTreebank'] + 'PennTreebank', + 'WMTNewsCrawl'] diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 9daeb02ff9..8557616484 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -39,7 +39,7 @@ def get_iterator(self): return self._iterator -def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'valid')): +def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'valid'), **kwargs): if isinstance(data_select, str): data_select = [data_select] if not set(data_select).issubset(set(('train', 'test', 'valid'))): @@ -50,6 +50,17 @@ def _setup_datasets(dataset_name, root='.data', data_select=('train', 'test', 'v select_to_index = {'train': 0, 'test': 1, 'valid': 2} extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], root=root) for key in data_select] + elif dataset_name == 'WMTNewsCrawl': + if not (data_select == ['train'] or set(data_select).issubset(set(('train',)))): + raise ValueError("WMTNewsCrawl only creates a training dataset. " + "data_select should be 'train' " + "or ('train',), got {}.".format(data_select)) + dataset_tar = download_from_url(URLS[dataset_name], root=root) + extracted_files = extract_archive(dataset_tar) + year = kwargs.get('year', 2010) + language = kwargs.get('language', 'en') + file_name = 'news.{}.{}.shuffled'.format(year, language) + extracted_files = [f for f in extracted_files if file_name in f] else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = extract_archive(dataset_tar) @@ -125,7 +136,7 @@ def PennTreebank(*args, **kwargs): Arguments: root: Directory where the datasets are saved. Default: ".data" - data_select: a string or tupel for the returned datasets + data_select: a string or tuple for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. 
Users could also choose any one or two of them, for example ('train', 'test') or @@ -135,7 +146,7 @@ def PennTreebank(*args, **kwargs): Examples: >>> from torchtext.experimental.datasets.raw import PennTreebank - >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) + >>> train_dataset, test_dataset, valid_dataset = PennTreebank() >>> valid_dataset, = PennTreebank(data_select='valid') """ @@ -145,12 +156,21 @@ def PennTreebank(*args, **kwargs): def WMTNewsCrawl(*args, **kwargs): """ Defines WMT News Crawl. + + Create language modeling dataset: WMTNewsCrawl + + Arguments: + root: Directory where the datasets are saved. Default: ".data" + data_select: a string or tuple for the returned datasets. + (Default: 'train') """ + return _setup_datasets(*(("WMTNewsCrawl",) + args), **kwargs) DATASETS = { 'WikiText2': WikiText2, 'WikiText103': WikiText103, - 'PennTreebank': PennTreebank + 'PennTreebank': PennTreebank, + 'WMTNewsCrawl': WMTNewsCrawl } From 321871ddaaf2feced2eca67de64937ec78bf7c9c Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Sat, 23 May 2020 21:51:19 -0400 Subject: [PATCH 18/25] restore setup_iter --- torchtext/experimental/datasets/raw/language_modeling.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 8557616484..a867108978 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -25,10 +25,18 @@ def __init__(self, iterator, start=0, num_lines=None): """ super(RawTextIterableDataset, self).__init__() self._iterator = iterator + self.has_setup = False self.start = start self.num_lines = num_lines + def setup_iter(self, start=0, num_lines=None): + self.start = start + self.num_lines = num_lines + self.has_setup = True + def __iter__(self): + if not self.has_setup: + self.setup_iter() for i, item in enumerate(self._iterator): if i >= self.start: yield item From 0409da8a7ef7ae5b292595b039fda34c012168f9 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 2 Jun 2020 22:17:59 -0400 Subject: [PATCH 19/25] add `single_line` option --- .../datasets/language_modeling.py | 28 +++++++++++++------ 1 file changed, 20 insertions(+), 8 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 28eed7aca3..4892985d00 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -23,7 +23,7 @@ class LanguageModelingDataset(torch.utils.data.Dataset): """ - def __init__(self, data, vocab, transforms): + def __init__(self, data, vocab, transforms, single_line): """Initiate language modeling dataset. 
Arguments: @@ -38,10 +38,17 @@ def __init__(self, data, vocab, transforms): super(LanguageModelingDataset, self).__init__() self.vocab = vocab self.transforms = transforms - self.data = torch.cat(tuple(transforms(row) for row in data), axis=0) + self.single_line = single_line + if single_line == True: + self.data = torch.cat(tuple(transforms(row) for row in data), axis=0) + else: + self.data = data def __getitem__(self, i): - return self.data[i] + if self.single_line: + return self.data[i] + else: + return self.transforms(self.data[i]) def __len__(self): return len(self.data) @@ -55,7 +62,7 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, - data_select=('train', 'test', 'valid')): + data_select=('train', 'test', 'valid'), single_line=True): if tokenizer is None: tokenizer = get_tokenizer('basic_english') text_transform = sequential_transforms(tokenizer) @@ -71,9 +78,14 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, else: train, test, valid = raw.DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) # Cache raw text iterable dataset - raw_data = {'train': [txt for txt in train], - 'valid': [txt for txt in valid], - 'test': [txt for txt in test]} + if single_line: + raw_data = {'train': [" ".join([txt for txt in train]), ], + 'valid': [" ".join(txt for txt in valid), ], + 'test': [" ".join(txt for txt in test), ]} + else: + raw_data = {'train': [txt for txt in train], + 'valid': [txt for txt in valid], + 'test': [txt for txt in test]} if vocab is None: if 'train' not in data_select: @@ -81,7 +93,7 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, vocab = build_vocab(raw_data['train'], text_transform) text_transform = sequential_transforms(text_transform, vocab_func(vocab), totensor(dtype=torch.long)) - return tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform) + return tuple(LanguageModelingDataset(raw_data[item], vocab, text_transform, single_line) for item in data_select) From 0a6f6ac8fac7d1b8bd0863938dfc16b1a2a63727 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Tue, 2 Jun 2020 22:18:18 -0400 Subject: [PATCH 20/25] minor change --- torchtext/experimental/datasets/language_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 4892985d00..cb24e8540f 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -39,7 +39,7 @@ def __init__(self, data, vocab, transforms, single_line): self.vocab = vocab self.transforms = transforms self.single_line = single_line - if single_line == True: + if single_line: self.data = torch.cat(tuple(transforms(row) for row in data), axis=0) else: self.data = data From 66fe231d17f314a9561915ce912909a98bb713b8 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 3 Jun 2020 20:40:27 -0400 Subject: [PATCH 21/25] Update docs. 
--- .../datasets/language_modeling.py | 41 +++++++++++++------ 1 file changed, 28 insertions(+), 13 deletions(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index cb24e8540f..29f46863be 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -104,13 +104,13 @@ def WikiText2(*args, **kwargs): Separately returns the train/test/valid set Arguments: - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -118,6 +118,10 @@ def WikiText2(*args, **kwargs): just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. + single_line: whether to return all tokens in a single line. + (Default: True) + By default, all lines in raw text file are concatenated into a single line. + Use `single_line = False` if one wants to get data line by line. Examples: >>> from torchtext.experimental.datasets import WikiText2 @@ -140,13 +144,13 @@ def WikiText103(*args, **kwargs): Separately returns the train/test/valid set Arguments: - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -154,6 +158,10 @@ def WikiText103(*args, **kwargs): just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. + single_line: whether to return all tokens in a single line. + (Default: True) + By default, all lines in raw text file are concatenated into a single line. + Use `single_line = False` if one wants to get data line by line. Examples: >>> from torchtext.experimental.datasets import WikiText103 @@ -176,13 +184,13 @@ def PennTreebank(*args, **kwargs): Separately returns the train/test/valid set Arguments: - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. 
The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. data_select: a string or tupel for the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users @@ -190,6 +198,10 @@ def PennTreebank(*args, **kwargs): just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. + single_line: whether to return all tokens in a single line. + (Default: True) + By default, all lines in raw text file are concatenated into a single line. + Use `single_line = False` if one wants to get data line by line. Examples: >>> from torchtext.experimental.datasets import PennTreebank @@ -212,16 +224,19 @@ def WMTNewsCrawl(*args, **kwargs): returns the train set Arguments: - root: Directory where the datasets are saved. Default: ".data" - vocab: Vocabulary used for dataset. If None, it will generate a new - vocabulary based on the train data set. tokenizer: the tokenizer used to preprocess raw text data. The default one is basic_english tokenizer in fastText. spacy tokenizer is supported as well (see example below). A custom tokenizer is callable function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. data_select: a string or tupel for the returned datasets (Default: ('train',)) - + single_line: whether to return all tokens in a single line. + (Default: True) + By default, all lines in raw text file are concatenated into a single line. + Use `single_line = False` if one wants to get data line by line. 
Examples: >>> from torchtext.experimental.datasets import WMTNewsCrawl >>> from torchtext.data.utils import get_tokenizer From 5f8a63a04969b71ae64fe502b7cce4f9cb15d24d Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 3 Jun 2020 22:20:23 -0400 Subject: [PATCH 22/25] take care of `single_line` in WMTNewsCrawl --- torchtext/experimental/datasets/language_modeling.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 29f46863be..7f27ddd7bb 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -74,7 +74,10 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, if dataset_name == 'WMTNewsCrawl': train, = raw.DATASETS[dataset_name](root=root, data_select=('train',)) - raw_data = {'train': [txt for txt in train]} + if single_line: + raw_data = {'train': [" ".join([txt for txt in train]), ]} + else: + raw_data = {'train': [txt for txt in train]} else: train, test, valid = raw.DATASETS[dataset_name](root=root, data_select=('train', 'test', 'valid')) # Cache raw text iterable dataset From 1ed393eecf16242034b2566963740348cdd5e249 Mon Sep 17 00:00:00 2001 From: Runmin Zhang Date: Wed, 3 Jun 2020 23:27:05 -0400 Subject: [PATCH 23/25] add unit test for WMTNewsCrawl --- test/data/test_builtin_datasets.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index b388dc61de..364e824199 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -76,7 +76,7 @@ def test_penntreebank_legacy(self): def test_penntreebank(self): from torchtext.experimental.datasets import PennTreebank - # smoke test to ensure wikitext2 works properly + # smoke test to ensure penn treebank works properly train_dataset, test_dataset, valid_dataset = PennTreebank() self.assertEqual(len(train_dataset), 924412) self.assertEqual(len(test_dataset), 82114) @@ -86,6 +86,16 @@ def test_penntreebank(self): tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) + def test_wmtnewscrawl(self): + from torchtext.experimental.datasets import WMTNewsCrawl + # smoke test to ensure WMTNewsCrawl works properly + train_dataset, = WMTNewsCrawl() + self.assertEqual(len(train_dataset), 399857558) + + vocab = train_dataset.get_vocab() + tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] + self.assertEqual(tokens_ids, [3, 1075, 3572, 1134]) + def test_text_classification(self): # smoke test to ensure ag_news dataset works properly From e55b60bc36ddd233f05de8a856ce9bad811b507c Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 4 Jun 2020 09:16:19 -0700 Subject: [PATCH 24/25] remove the unit test for WMTNewsCrawl because it takes too long time to download files --- test/data/test_builtin_datasets.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 6d3628d7e2..45e0d096bb 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -88,16 +88,6 @@ def test_penntreebank(self): tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) - def test_wmtnewscrawl(self): - from torchtext.experimental.datasets 
import WMTNewsCrawl - # smoke test to ensure WMTNewsCrawl works properly - train_dataset, = WMTNewsCrawl() - self.assertEqual(len(train_dataset), 399857558) - - vocab = train_dataset.get_vocab() - tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] - self.assertEqual(tokens_ids, [3, 1075, 3572, 1134]) - def test_text_classification(self): # smoke test to ensure ag_news dataset works properly From a2e7b02dbebb001bbde2283360a66bd18c2e6f78 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 4 Jun 2020 13:51:44 -0700 Subject: [PATCH 25/25] raise an error for single_line --- torchtext/experimental/datasets/language_modeling.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtext/experimental/datasets/language_modeling.py b/torchtext/experimental/datasets/language_modeling.py index 7f27ddd7bb..f44de4af88 100644 --- a/torchtext/experimental/datasets/language_modeling.py +++ b/torchtext/experimental/datasets/language_modeling.py @@ -72,6 +72,8 @@ def _setup_datasets(dataset_name, tokenizer=None, root='.data', vocab=None, if not set(data_select).issubset(set(('train', 'valid', 'test'))): raise TypeError('Given data selection {} is not supported!'.format(data_select)) + if not single_line and dataset_name != 'WikiText103': + raise TypeError('single_line must be True except for WikiText103') if dataset_name == 'WMTNewsCrawl': train, = raw.DATASETS[dataset_name](root=root, data_select=('train',)) if single_line:
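
Taken together, the patches above introduce a `single_line` flag for the experimental
language modeling datasets: when it is True (the default), every raw line is tokenized,
numericalized, and concatenated into one flat 1-D tensor at construction time, so
indexing the dataset returns a single token id; when it is False, the raw lines are kept
and the transforms are applied lazily in `__getitem__`, so indexing returns a 1-D tensor
of token ids for that line. After PATCH 25/25, `single_line=False` is only accepted for
WikiText103; other datasets raise a TypeError.

A minimal usage sketch of that behavior (not part of the patch series; it assumes a
torchtext build with these patches applied, and the concrete values returned are
dataset-dependent):

    >>> from torchtext.experimental.datasets import WikiText2, WikiText103
    >>> # single_line=True (default): one concatenated stream of token ids
    >>> train, test, valid = WikiText2()
    >>> vocab = train.get_vocab()
    >>> train[0]       # a 0-dim torch.long tensor holding one token id
    >>> len(train)     # total number of tokens in the training split
    >>> # single_line=False: line-by-line access, currently WikiText103 only
    >>> train103, = WikiText103(data_select='train', single_line=False)
    >>> train103[0]    # a 1-D tensor of token ids for the first raw line
    >>> len(train103)  # number of raw lines, not tokens

The flat single-line layout suits the usual language modeling pipeline, where the token
stream is sliced into fixed-length BPTT batches, while `single_line=False` preserves line
boundaries for models that need per-line examples.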