From 45d53de2ad8ffa20f27b28bee593247cdff670ba Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 23 Oct 2019 14:01:40 -0700 Subject: [PATCH 01/47] Move PennTreebank, WikiText103, WikiText2 to torchtext.legacy --- torchtext/__init__.py | 4 +- torchtext/legacy/__init__.py | 5 + torchtext/legacy/datasets/__init__.py | 4 + .../legacy/datasets/language_modeling.py | 220 ++++++++++++++++++ 4 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 torchtext/legacy/__init__.py create mode 100644 torchtext/legacy/datasets/__init__.py create mode 100644 torchtext/legacy/datasets/language_modeling.py diff --git a/torchtext/__init__.py b/torchtext/__init__.py index e799058a88..2701580acb 100644 --- a/torchtext/__init__.py +++ b/torchtext/__init__.py @@ -2,10 +2,12 @@ from . import datasets from . import utils from . import vocab +from . import legacy __version__ = '0.4.0' __all__ = ['data', 'datasets', 'utils', - 'vocab'] + 'vocab', + 'legacy'] diff --git a/torchtext/legacy/__init__.py b/torchtext/legacy/__init__.py new file mode 100644 index 0000000000..ab3e4d382f --- /dev/null +++ b/torchtext/legacy/__init__.py @@ -0,0 +1,5 @@ +from . import datasets + +__version__ = '0.4.0' + +__all__ = ['datasets'] diff --git a/torchtext/legacy/datasets/__init__.py b/torchtext/legacy/datasets/__init__.py new file mode 100644 index 0000000000..0320fc3d56 --- /dev/null +++ b/torchtext/legacy/datasets/__init__.py @@ -0,0 +1,4 @@ +from .language_modeling import LanguageModelingDataset, WikiText2, WikiText103, PennTreebank # NOQA + + +__all__ = ['LanguageModelingDataset'] diff --git a/torchtext/legacy/datasets/language_modeling.py b/torchtext/legacy/datasets/language_modeling.py new file mode 100644 index 0000000000..ed7b912efc --- /dev/null +++ b/torchtext/legacy/datasets/language_modeling.py @@ -0,0 +1,220 @@ +from torchtext import data +import io +import warnings + + +class LanguageModelingDataset(data.Dataset): + """Defines a dataset for language modeling.""" + + def __init__(self, path, text_field, newline_eos=True, + encoding='utf-8', **kwargs): + """Create a LanguageModelingDataset given a path and a field. + + Arguments: + path: Path to the data file. + text_field: The field that will be used for text data. + newline_eos: Whether to add an token for every newline in the + data file. Default: True. + Remaining keyword arguments: Passed to the constructor of + data.Dataset. + """ + warnings.warn("You are using a legacy code, which is not being covered " + "by the PyTorch team now !") + fields = [('text', text_field)] + text = [] + with io.open(path, encoding=encoding) as f: + for line in f: + text += text_field.preprocess(line) + if newline_eos: + text.append(u'') + + examples = [data.Example.fromlist([text], fields)] + super(LanguageModelingDataset, self).__init__( + examples, fields, **kwargs) + + +class WikiText2(LanguageModelingDataset): + + urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'] + name = 'wikitext-2' + dirname = 'wikitext-2' + + @classmethod + def splits(cls, text_field, root='.data', train='wiki.train.tokens', + validation='wiki.valid.tokens', test='wiki.test.tokens', + **kwargs): + """Create dataset objects for splits of the WikiText-2 dataset. + + This is the most flexible way to use the dataset. + + Arguments: + text_field: The field that will be used for text data. + root: The root directory that the dataset's zip archive will be + expanded into; therefore the directory in whose wikitext-2 + subdirectory the data files will be stored. 
+ train: The filename of the train data. Default: 'wiki.train.tokens'. + validation: The filename of the validation data, or None to not + load the validation set. Default: 'wiki.valid.tokens'. + test: The filename of the test data, or None to not load the test + set. Default: 'wiki.test.tokens'. + """ + return super(WikiText2, cls).splits( + root=root, train=train, validation=validation, test=test, + text_field=text_field, **kwargs) + + @classmethod + def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', + vectors=None, **kwargs): + """Create iterator objects for splits of the WikiText-2 dataset. + + This is the simplest way to use the dataset, and assumes common + defaults for field, vocabulary, and iterator parameters. + + Arguments: + batch_size: Batch size. + bptt_len: Length of sequences for backpropagation through time. + device: Device to create batches on. Use -1 for CPU and None for + the currently active GPU device. + root: The root directory that the dataset's zip archive will be + expanded into; therefore the directory in whose wikitext-2 + subdirectory the data files will be stored. + wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the + text field. The word vectors are accessible as + train.dataset.fields['text'].vocab.vectors. + Remaining keyword arguments: Passed to the splits method. + """ + TEXT = data.Field() + + train, val, test = cls.splits(TEXT, root=root, **kwargs) + + TEXT.build_vocab(train, vectors=vectors) + + return data.BPTTIterator.splits( + (train, val, test), batch_size=batch_size, bptt_len=bptt_len, + device=device) + + +class WikiText103(LanguageModelingDataset): + + urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip'] + name = 'wikitext-103' + dirname = 'wikitext-103' + + @classmethod + def splits(cls, text_field, root='.data', train='wiki.train.tokens', + validation='wiki.valid.tokens', test='wiki.test.tokens', + **kwargs): + """Create dataset objects for splits of the WikiText-103 dataset. + + This is the most flexible way to use the dataset. + + Arguments: + text_field: The field that will be used for text data. + root: The root directory that the dataset's zip archive will be + expanded into; therefore the directory in whose wikitext-103 + subdirectory the data files will be stored. + train: The filename of the train data. Default: 'wiki.train.tokens'. + validation: The filename of the validation data, or None to not + load the validation set. Default: 'wiki.valid.tokens'. + test: The filename of the test data, or None to not load the test + set. Default: 'wiki.test.tokens'. + """ + return super(WikiText103, cls).splits( + root=root, train=train, validation=validation, test=test, + text_field=text_field, **kwargs) + + @classmethod + def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', + vectors=None, **kwargs): + """Create iterator objects for splits of the WikiText-103 dataset. + + This is the simplest way to use the dataset, and assumes common + defaults for field, vocabulary, and iterator parameters. + + Arguments: + batch_size: Batch size. + bptt_len: Length of sequences for backpropagation through time. + device: Device to create batches on. Use -1 for CPU and None for + the currently active GPU device. + root: The root directory that the dataset's zip archive will be + expanded into; therefore the directory in whose wikitext-2 + subdirectory the data files will be stored. + wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the + text field. 
The word vectors are accessible as + train.dataset.fields['text'].vocab.vectors. + Remaining keyword arguments: Passed to the splits method. + """ + TEXT = data.Field() + + train, val, test = cls.splits(TEXT, root=root, **kwargs) + + TEXT.build_vocab(train, vectors=vectors) + + return data.BPTTIterator.splits( + (train, val, test), batch_size=batch_size, bptt_len=bptt_len, + device=device) + + +class PennTreebank(LanguageModelingDataset): + """The Penn Treebank dataset. + A relatively small dataset originally created for POS tagging. + + References + ---------- + Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993). + Building a Large Annotated Corpus of English: The Penn Treebank + """ + + urls = ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'] + name = 'penn-treebank' + dirname = '' + + @classmethod + def splits(cls, text_field, root='.data', train='ptb.train.txt', + validation='ptb.valid.txt', test='ptb.test.txt', + **kwargs): + """Create dataset objects for splits of the Penn Treebank dataset. + + Arguments: + text_field: The field that will be used for text data. + root: The root directory where the data files will be stored. + train: The filename of the train data. Default: 'ptb.train.txt'. + validation: The filename of the validation data, or None to not + load the validation set. Default: 'ptb.valid.txt'. + test: The filename of the test data, or None to not load the test + set. Default: 'ptb.test.txt'. + """ + return super(PennTreebank, cls).splits( + root=root, train=train, validation=validation, test=test, + text_field=text_field, **kwargs) + + @classmethod + def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', + vectors=None, **kwargs): + """Create iterator objects for splits of the Penn Treebank dataset. + + This is the simplest way to use the dataset, and assumes common + defaults for field, vocabulary, and iterator parameters. + + Arguments: + batch_size: Batch size. + bptt_len: Length of sequences for backpropagation through time. + device: Device to create batches on. Use -1 for CPU and None for + the currently active GPU device. + root: The root directory where the data files will be stored. + wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the + text field. The word vectors are accessible as + train.dataset.fields['text'].vocab.vectors. + Remaining keyword arguments: Passed to the splits method. + """ + TEXT = data.Field() + + train, val, test = cls.splits(TEXT, root=root, **kwargs) + + TEXT.build_vocab(train, vectors=vectors) + + return data.BPTTIterator.splits( + (train, val, test), batch_size=batch_size, bptt_len=bptt_len, + device=device) From 1f95483af8c537eb94f4b57fadf1637bf0dedd10 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 25 Oct 2019 11:30:20 -0700 Subject: [PATCH 02/47] Some initial work. --- torchtext/datasets/language_modeling.py | 119 +++++++++++++++++++----- 1 file changed, 98 insertions(+), 21 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 775fe51d9e..765b929ff2 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -1,33 +1,110 @@ +import torch +import logging from .. 
import data import io - - -class LanguageModelingDataset(data.Dataset): +from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader +from torchtext.vocab import build_vocab_from_iterator +from torchtext.data.utils import get_tokenizer +from torchtext.vocab import Vocab +from tqdm import tqdm + +URLS = { + 'WikiText2': + 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip', + 'WikiText103': + 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', + 'PennTreebank': + ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'] +} + +def _read_text_iterator(data_path, tokenizer): + tokenizer = get_tokenizer("basic_english") + with io.open(data_path, encoding="utf8") as f: + reader = unicode_csv_reader(f) + for row in reader: + tokens = ' '.join(row[1:]) + tokens = tokenizer(tokens) + yield tokens + + +def _create_data_from_iterator(vocab, iterator, include_unk): + data = [] + labels = [] + with tqdm(unit_scale=0, unit='lines') as t: + for cls, tokens in iterator: + if include_unk: + tokens = torch.tensor([vocab[token] for token in tokens]) + else: + token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] + for token in tokens])) + tokens = torch.tensor(token_ids) + if len(tokens) == 0: + logging.info('Row contains no tokens.') + data.append((cls, tokens)) + labels.append(cls) + t.update(1) + return data, set(labels) + +class LanguageModelingDataset(torch.utils.data.Dataset): """Defines a dataset for language modeling.""" - def __init__(self, path, text_field, newline_eos=True, - encoding='utf-8', **kwargs): + def __init__(self, data, vocab): """Create a LanguageModelingDataset given a path and a field. Arguments: path: Path to the data file. - text_field: The field that will be used for text data. - newline_eos: Whether to add an token for every newline in the - data file. Default: True. - Remaining keyword arguments: Passed to the constructor of - data.Dataset. 
""" - fields = [('text', text_field)] - text = [] - with io.open(path, encoding=encoding) as f: - for line in f: - text += text_field.preprocess(line) - if newline_eos: - text.append(u'') - - examples = [data.Example.fromlist([text], fields)] - super(LanguageModelingDataset, self).__init__( - examples, fields, **kwargs) + super(LanguageModelingDataset, self).__init__() + self._data = data + self._vocab = vocab + + def __getitem__(self, i): + return self._data[i] + + def __len__(self): + return len(self._data) + + def __iter__(self): + for x in self._data: + yield x + + def get_vocab(self): + return self._vocab + +def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), + root='.data', ngrams=2, vocab=None, include_unk=False): + dataset_tar = download_from_url(URLS[dataset_name], root=root) + extracted_files = extract_archive(dataset_tar) + + for fname in extracted_files: + if 'train' in fname: + train_path = fname + if 'test' in fname: + test_path = fname + if 'valid' in fname: + valid_path = fname + + if vocab is None: + logging.info('Building Vocab based on {}'.format(train_path)) + vocab = build_vocab_from_iterator(_read_text_iterator(train_path, tokenizer)) + else: + if not isinstance(vocab, Vocab): + raise TypeError("Passed vocabulary is not of type Vocab") + logging.info('Vocab has {} entries'.format(len(vocab))) + logging.info('Creating training data') + train_data = _create_data_from_iterator( + vocab, _read_text_iterator(train_path, tokenizer), include_unk) + logging.info('Creating testing data') + test_data = _create_data_from_iterator( + vocab, _read_text_iterator(test_path, tokenizer), include_unk) + logging.info('Creating valid data') + valid_data = _create_data_from_iterator( + vocab, _read_text_iterator(valid_path, tokenizer), include_unk) + return (LanguageModelingDataset(train_data, vocab), + LanguageModelingDataset(test_data, vocab), + LanguageModelingDataset(valid_data, vocab)) class WikiText2(LanguageModelingDataset): From 97af9d0db57a6750586f5958c1abd90cc1659f5d Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 29 Oct 2019 10:30:55 -0700 Subject: [PATCH 03/47] Re-write three datasets. --- torchtext/datasets/language_modeling.py | 240 ++++-------------------- 1 file changed, 34 insertions(+), 206 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 765b929ff2..4eb50080dd 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -1,7 +1,7 @@ import torch import logging -from .. 
import data import io +import os from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader from torchtext.vocab import build_vocab_from_iterator from torchtext.data.utils import get_tokenizer @@ -15,37 +15,35 @@ 'https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip', 'PennTreebank': ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'] + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt', + 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt'] } + def _read_text_iterator(data_path, tokenizer): - tokenizer = get_tokenizer("basic_english") with io.open(data_path, encoding="utf8") as f: reader = unicode_csv_reader(f) for row in reader: - tokens = ' '.join(row[1:]) - tokens = tokenizer(tokens) + tokens = tokenizer(' '.join(row)) yield tokens def _create_data_from_iterator(vocab, iterator, include_unk): - data = [] - labels = [] + _data = [] with tqdm(unit_scale=0, unit='lines') as t: - for cls, tokens in iterator: + for tokens in iterator: if include_unk: - tokens = torch.tensor([vocab[token] for token in tokens]) + tokens = [vocab[token] for token in tokens] else: token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] for token in tokens])) - tokens = torch.tensor(token_ids) + tokens = token_ids if len(tokens) == 0: logging.info('Row contains no tokens.') - data.append((cls, tokens)) - labels.append(cls) + _data += tokens t.update(1) - return data, set(labels) + return torch.Tensor(_data).long() + class LanguageModelingDataset(torch.utils.data.Dataset): """Defines a dataset for language modeling.""" @@ -73,18 +71,23 @@ def __iter__(self): def get_vocab(self): return self._vocab -def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', ngrams=2, vocab=None, include_unk=False): - dataset_tar = download_from_url(URLS[dataset_name], root=root) - extracted_files = extract_archive(dataset_tar) - for fname in extracted_files: - if 'train' in fname: - train_path = fname - if 'test' in fname: - test_path = fname - if 'valid' in fname: - valid_path = fname +def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), + root='.data', vocab=None, include_unk=False): + if dataset_name == 'PennTreebank': + train_path = download_from_url(URLS['PennTreebank'][0], root=root) + test_path = download_from_url(URLS['PennTreebank'][1], root=root) + valid_path = download_from_url(URLS['PennTreebank'][2], root=root) + else: + dataset_tar = download_from_url(URLS[dataset_name], root=root) + extracted_files = extract_archive(dataset_tar) + for fname in extracted_files: + if 'train' in fname: + train_path = os.path.join(root, fname) + if 'test' in fname: + test_path = os.path.join(root, fname) + if 'valid' in fname: + valid_path = os.path.join(root, fname) if vocab is None: logging.info('Building Vocab based on {}'.format(train_path)) @@ -107,188 +110,13 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), LanguageModelingDataset(valid_data, vocab)) -class WikiText2(LanguageModelingDataset): - - urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip'] - name = 'wikitext-2' - dirname = 'wikitext-2' - - @classmethod - def splits(cls, text_field, root='.data', train='wiki.train.tokens', - validation='wiki.valid.tokens', 
test='wiki.test.tokens', - **kwargs): - """Create dataset objects for splits of the WikiText-2 dataset. - - This is the most flexible way to use the dataset. - - Arguments: - text_field: The field that will be used for text data. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'wiki.train.tokens'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'wiki.valid.tokens'. - test: The filename of the test data, or None to not load the test - set. Default: 'wiki.test.tokens'. - """ - return super(WikiText2, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the WikiText-2 dataset. - - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Arguments: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the - text field. The word vectors are accessible as - train.dataset.fields['text'].vocab.vectors. - Remaining keyword arguments: Passed to the splits method. - """ - TEXT = data.Field() - - train, val, test = cls.splits(TEXT, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) - - -class WikiText103(LanguageModelingDataset): - - urls = ['https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-v1.zip'] - name = 'wikitext-103' - dirname = 'wikitext-103' - - @classmethod - def splits(cls, text_field, root='.data', train='wiki.train.tokens', - validation='wiki.valid.tokens', test='wiki.test.tokens', - **kwargs): - """Create dataset objects for splits of the WikiText-103 dataset. - - This is the most flexible way to use the dataset. +def WikiText2(*args, **kwargs): + return _setup_datasets(*(("WikiText2",) + args), **kwargs) - Arguments: - text_field: The field that will be used for text data. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-103 - subdirectory the data files will be stored. - train: The filename of the train data. Default: 'wiki.train.tokens'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'wiki.valid.tokens'. - test: The filename of the test data, or None to not load the test - set. Default: 'wiki.test.tokens'. - """ - return super(WikiText103, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the WikiText-103 dataset. 
- - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Arguments: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory that the dataset's zip archive will be - expanded into; therefore the directory in whose wikitext-2 - subdirectory the data files will be stored. - wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the - text field. The word vectors are accessible as - train.dataset.fields['text'].vocab.vectors. - Remaining keyword arguments: Passed to the splits method. - """ - TEXT = data.Field() - - train, val, test = cls.splits(TEXT, root=root, **kwargs) - - TEXT.build_vocab(train, vectors=vectors) - - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) - - -class PennTreebank(LanguageModelingDataset): - """The Penn Treebank dataset. - A relatively small dataset originally created for POS tagging. - - References - ---------- - Marcus, Mitchell P., Marcinkiewicz, Mary Ann & Santorini, Beatrice (1993). - Building a Large Annotated Corpus of English: The Penn Treebank - """ - - urls = ['https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.train.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.valid.txt', - 'https://raw.githubusercontent.com/wojzaremba/lstm/master/data/ptb.test.txt'] - name = 'penn-treebank' - dirname = '' - - @classmethod - def splits(cls, text_field, root='.data', train='ptb.train.txt', - validation='ptb.valid.txt', test='ptb.test.txt', - **kwargs): - """Create dataset objects for splits of the Penn Treebank dataset. - - Arguments: - text_field: The field that will be used for text data. - root: The root directory where the data files will be stored. - train: The filename of the train data. Default: 'ptb.train.txt'. - validation: The filename of the validation data, or None to not - load the validation set. Default: 'ptb.valid.txt'. - test: The filename of the test data, or None to not load the test - set. Default: 'ptb.test.txt'. - """ - return super(PennTreebank, cls).splits( - root=root, train=train, validation=validation, test=test, - text_field=text_field, **kwargs) - - @classmethod - def iters(cls, batch_size=32, bptt_len=35, device=0, root='.data', - vectors=None, **kwargs): - """Create iterator objects for splits of the Penn Treebank dataset. - - This is the simplest way to use the dataset, and assumes common - defaults for field, vocabulary, and iterator parameters. - - Arguments: - batch_size: Batch size. - bptt_len: Length of sequences for backpropagation through time. - device: Device to create batches on. Use -1 for CPU and None for - the currently active GPU device. - root: The root directory where the data files will be stored. - wv_dir, wv_type, wv_dim: Passed to the Vocab constructor for the - text field. The word vectors are accessible as - train.dataset.fields['text'].vocab.vectors. - Remaining keyword arguments: Passed to the splits method. 
- """ - TEXT = data.Field() - train, val, test = cls.splits(TEXT, root=root, **kwargs) +def WikiText103(*args, **kwargs): + return _setup_datasets(*(("WikiText103",) + args), **kwargs) - TEXT.build_vocab(train, vectors=vectors) - return data.BPTTIterator.splits( - (train, val, test), batch_size=batch_size, bptt_len=bptt_len, - device=device) +def PennTreebank(*args, **kwargs): + return _setup_datasets(*(("PennTreebank",) + args), **kwargs) From cc127dec11c174533ca9a3f4d376220101e4293e Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 29 Oct 2019 11:07:45 -0700 Subject: [PATCH 04/47] Update tests. --- test/data/test_builtin_datasets.py | 45 ++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 3 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index c16777c214..650f9a3ac0 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -1,6 +1,5 @@ import os import torchtext.data as data -from torchtext.datasets import WikiText2, PennTreebank from torchtext.datasets import AG_NEWS from ..common.test_markers import slow @@ -14,7 +13,8 @@ def conditional_remove(f): class TestDataset(TorchtextTestCase): @slow - def test_wikitext2(self): + def test_wikitext2_legacy(self): + from torchtext.legacy.datasets import WikiText2 # smoke test to ensure wikitext2 works properly ds = WikiText2 TEXT = data.Field(lower=True, batch_first=True) @@ -31,8 +31,26 @@ def test_wikitext2(self): datafile = os.path.join(self.project_root, ".data", "wikitext-2") conditional_remove(datafile) + def test_wikitext2(self): + from torchtext.datasets import WikiText2 + # smoke test to ensure wikitext2 works properly + train_dataset, test_dataset, valid_dataset = WikiText2() + self.assertEqual(len(train_dataset), 1947375) + self.assertEqual(len(test_dataset), 230357) + self.assertEqual(len(valid_dataset), 203947) + + vocab = train_dataset.get_vocab() + tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] + self.assertEqual(tokens_ids, [2, 285, 502, 699]) + + # Delete the dataset after we're done to save disk space on CI + if os.environ.get("TRAVIS") == "true": + datafile = os.path.join(self.project_root, ".data", "wikitext-2") + conditional_remove(datafile) + @slow - def test_penntreebank(self): + def test_penntreebank_legacy(self): + from torchtext.legacy.datasets import PennTreebank # smoke test to ensure penn treebank works properly TEXT = data.Field(lower=True, batch_first=True) ds = PennTreebank @@ -49,6 +67,27 @@ def test_penntreebank(self): datafile = os.path.join(self.project_root, ".data", "penn-treebank") conditional_remove(datafile) + def test_penntreebank(self): + from torchtext.datasets import PennTreebank + # smoke test to ensure wikitext2 works properly + train_dataset, test_dataset, valid_dataset = PennTreebank() + self.assertEqual(len(train_dataset), 924412) + self.assertEqual(len(test_dataset), 82114) + self.assertEqual(len(valid_dataset), 73339) + + vocab = train_dataset.get_vocab() + tokens_ids = [vocab[token] for token in 'the player characters rest'.split()] + self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) + + # Delete the dataset after we're done to save disk space on CI + if os.environ.get("TRAVIS") == "true": + datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.ttest.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') 
+ conditional_remove(datafile) + def test_text_classification(self): # smoke test to ensure ag_news dataset works properly From 97cfd05260352c33533904a094ef5425e2a1c68c Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 29 Oct 2019 11:16:41 -0700 Subject: [PATCH 05/47] Move legacy docs for language modeling dataset. --- docs/source/legacy/datasets.rst | 69 +++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 docs/source/legacy/datasets.rst diff --git a/docs/source/legacy/datasets.rst b/docs/source/legacy/datasets.rst new file mode 100644 index 0000000000..859b5df5da --- /dev/null +++ b/docs/source/legacy/datasets.rst @@ -0,0 +1,69 @@ +torchtext.legacy.datasets +==================== + +.. currentmodule:: torchtext.legacy.datasets + +TorchText legacy datasets. + +All datasets are subclasses of :class:`torchtext.data.Dataset`, which +inherits from :class:`torch.utils.data.Dataset` i.e, they have ``split`` and +``iters`` methods implemented. + +General use cases are as follows: + +Approach 1, ``splits``: :: + + # set up fields + TEXT = data.Field(lower=True, include_lengths=True, batch_first=True) + LABEL = data.Field(sequential=False) + + # make splits for data + train, test = datasets.IMDB.splits(TEXT, LABEL) + + # build the vocabulary + TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300)) + LABEL.build_vocab(train) + + # make iterator for splits + train_iter, test_iter = data.BucketIterator.splits( + (train, test), batch_size=3, device=0) + +Approach 2, ``iters``: :: + + # use default configurations + train_iter, test_iter = datasets.IMDB.iters(batch_size=4) + +The following datasets are available: + +.. contents:: Datasets + :local: + + +Language Modeling +^^^^^^^^^^^^^^^^^ + +Language modeling datasets are subclasses of ``LanguageModelingDataset`` class. + +.. autoclass:: LanguageModelingDataset + :members: __init__ + + +WikiText-2 +~~~~~~~~~~ + +.. autoclass:: WikiText2 + :members: splits, iters + + +WikiText103 +~~~~~~~~~~~ + +.. autoclass:: WikiText103 + :members: splits, iters + + +PennTreebank +~~~~~~~~~~~~ + +.. autoclass:: PennTreebank + :members: splits, iters From 0ac3e180dc0ffb35dceefdecf916dc3fce299dbf Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 29 Oct 2019 11:59:03 -0700 Subject: [PATCH 06/47] Update docs. --- torchtext/datasets/language_modeling.py | 97 ++++++++++++++++++++++++- 1 file changed, 94 insertions(+), 3 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 4eb50080dd..a0393fb3c9 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -46,14 +46,33 @@ def _create_data_from_iterator(vocab, iterator, include_unk): class LanguageModelingDataset(torch.utils.data.Dataset): - """Defines a dataset for language modeling.""" + """Defines a dataset for language modeling. + Currently, we only support the following datasets: + + - WikiText2 + - WikiText103 + - PennTreebank + + """ def __init__(self, data, vocab): - """Create a LanguageModelingDataset given a path and a field. + """Initiate language modeling dataset. Arguments: - path: Path to the data file. + data: a tensor of tokens. tokens are ids after + numericalizing the string tokens. + torch.Tensor([token_id_1, token_id_2, token_id_3, token_id1]).long() + vocab: Vocabulary object used for dataset. 
+ + Examples: + >>> from torchtext.vocab import build_vocab_from_iterator + >>> data = torch.Tensor([token_id_1, token_id_2, + token_id_3, token_id_1]).long() + >>> vocab = build_vocab_from_iterator([['language', 'modeling']]) + >>> dataset = LanguageModelingDataset(data, vocab) + """ + super(LanguageModelingDataset, self).__init__() self._data = data self._vocab = vocab @@ -111,12 +130,84 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), def WikiText2(*args, **kwargs): + """ Defines WikiText2 datasets. + + Create language modeling dataset: WikiText2 + Separately returns the train/test/valid set + + Arguments: + tokenizer: the tokenizer used to preprocess raw text data. + The default one is basic_english tokenizer in fastText. spacy tokenizer + is supported as well (see example below). A custom tokenizer is callable + function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. + include_unk: include unknown token in the data (Default: False) + + Examples: + >>> from torchtext.datasets import WikiText2 + >>> from torchtext.data.utils import get_tokenizer + >>> tokenizer = get_tokenizer("spacy") + >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer) + >>> vocab = train_dataset.get_vocab() + + """ + return _setup_datasets(*(("WikiText2",) + args), **kwargs) def WikiText103(*args, **kwargs): + """ Defines WikiText103 datasets. + + Create language modeling dataset: WikiText103 + Separately returns the train/test/valid set + + Arguments: + tokenizer: the tokenizer used to preprocess raw text data. + The default one is basic_english tokenizer in fastText. spacy tokenizer + is supported as well (see example below). A custom tokenizer is callable + function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. + include_unk: include unknown token in the data (Default: False) + + Examples: + >>> from torchtext.datasets import WikiText103 + >>> from torchtext.data.utils import get_tokenizer + >>> tokenizer = get_tokenizer("spacy") + >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer) + >>> vocab = train_dataset.get_vocab() + + """ + return _setup_datasets(*(("WikiText103",) + args), **kwargs) def PennTreebank(*args, **kwargs): + """ Defines PennTreebank datasets. + + Create language modeling dataset: PennTreebank + Separately returns the train/test/valid set + + Arguments: + tokenizer: the tokenizer used to preprocess raw text data. + The default one is basic_english tokenizer in fastText. spacy tokenizer + is supported as well (see example below). A custom tokenizer is callable + function with input of a string and output of a token list. + root: Directory where the datasets are saved. Default: ".data" + vocab: Vocabulary used for dataset. If None, it will generate a new + vocabulary based on the train data set. 
+ include_unk: include unknown token in the data (Default: False) + + Examples: + >>> from torchtext.datasets import PennTreebank + >>> from torchtext.data.utils import get_tokenizer + >>> tokenizer = get_tokenizer("spacy") + >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) + >>> vocab = train_dataset.get_vocab() + + """ + return _setup_datasets(*(("PennTreebank",) + args), **kwargs) From 56046fa0231cf24ae237453d30afec84448f713d Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 31 Oct 2019 09:01:58 -0700 Subject: [PATCH 07/47] Minor debug --- torchtext/legacy/datasets/language_modeling.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torchtext/legacy/datasets/language_modeling.py b/torchtext/legacy/datasets/language_modeling.py index ed7b912efc..6b814dfe1e 100644 --- a/torchtext/legacy/datasets/language_modeling.py +++ b/torchtext/legacy/datasets/language_modeling.py @@ -22,6 +22,10 @@ def __init__(self, path, text_field, newline_eos=True, "by the PyTorch team now !") fields = [('text', text_field)] text = [] + import os + print(os.listdir(".data/")) + print(os.listdir(".data/wikitext-2")) + print(os.listdir(".data/wikitext-2/wikitext-2")) with io.open(path, encoding=encoding) as f: for line in f: text += text_field.preprocess(line) From 9962732ffc87576937c1bdd7dcead8a468072785 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 31 Oct 2019 10:39:32 -0700 Subject: [PATCH 08/47] Update test. --- test/data/test_builtin_datasets.py | 10 +++++++++- torchtext/legacy/datasets/language_modeling.py | 6 +++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 650f9a3ac0..615abde260 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -1,4 +1,5 @@ import os +import shutil import torchtext.data as data from torchtext.datasets import AG_NEWS @@ -9,6 +10,8 @@ def conditional_remove(f): if os.path.isfile(f): os.remove(f) + elif os.path.isdir(f): + shutil.rmtree(f) class TestDataset(TorchtextTestCase): @@ -47,6 +50,8 @@ def test_wikitext2(self): if os.environ.get("TRAVIS") == "true": datafile = os.path.join(self.project_root, ".data", "wikitext-2") conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") + conditional_remove(datafile) @slow def test_penntreebank_legacy(self): @@ -100,5 +105,8 @@ def test_text_classification(self): # Delete the dataset after we're done to save disk space on CI if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", "AG_NEWS") + datafile = os.path.join(self.project_root, ".data", "ag_news_csv") + conditional_remove(datafile) + + datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") conditional_remove(datafile) diff --git a/torchtext/legacy/datasets/language_modeling.py b/torchtext/legacy/datasets/language_modeling.py index 6b814dfe1e..a22f478bb8 100644 --- a/torchtext/legacy/datasets/language_modeling.py +++ b/torchtext/legacy/datasets/language_modeling.py @@ -23,9 +23,9 @@ def __init__(self, path, text_field, newline_eos=True, fields = [('text', text_field)] text = [] import os - print(os.listdir(".data/")) - print(os.listdir(".data/wikitext-2")) - print(os.listdir(".data/wikitext-2/wikitext-2")) + print(".data/ ", os.listdir(".data/")) + print(".data/wikitext-2 ", os.listdir(".data/wikitext-2")) + print(".data/wikitext-2/wikitext-2 ", os.listdir(".data/wikitext-2/wikitext-2")) with 
io.open(path, encoding=encoding) as f: for line in f: text += text_field.preprocess(line) From ad7938ed6864e49d146df31fb23d32b8fc61f876 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 31 Oct 2019 11:51:31 -0700 Subject: [PATCH 09/47] Minor change in tests. --- test/data/test_builtin_datasets.py | 3 +-- torchtext/legacy/datasets/language_modeling.py | 4 ---- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 615abde260..efd65310b4 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -88,7 +88,7 @@ def test_penntreebank(self): if os.environ.get("TRAVIS") == "true": datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.ttest.txt') + datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') conditional_remove(datafile) datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') conditional_remove(datafile) @@ -107,6 +107,5 @@ def test_text_classification(self): if os.environ.get("TRAVIS") == "true": datafile = os.path.join(self.project_root, ".data", "ag_news_csv") conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") conditional_remove(datafile) diff --git a/torchtext/legacy/datasets/language_modeling.py b/torchtext/legacy/datasets/language_modeling.py index a22f478bb8..ed7b912efc 100644 --- a/torchtext/legacy/datasets/language_modeling.py +++ b/torchtext/legacy/datasets/language_modeling.py @@ -22,10 +22,6 @@ def __init__(self, path, text_field, newline_eos=True, "by the PyTorch team now !") fields = [('text', text_field)] text = [] - import os - print(".data/ ", os.listdir(".data/")) - print(".data/wikitext-2 ", os.listdir(".data/wikitext-2")) - print(".data/wikitext-2/wikitext-2 ", os.listdir(".data/wikitext-2/wikitext-2")) with io.open(path, encoding=encoding) as f: for line in f: text += text_field.preprocess(line) From 3ff1cce0a225c0757977856b281f303077cca2fa Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 31 Oct 2019 11:52:20 -0700 Subject: [PATCH 10/47] Flake8 --- test/data/test_builtin_datasets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index efd65310b4..acd57e2528 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -1,5 +1,5 @@ import os -import shutil +import shutil import torchtext.data as data from torchtext.datasets import AG_NEWS From cc1ae4d81aba271e7bfb5aae71a24a1f9404d52e Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 5 Nov 2019 09:00:55 -0800 Subject: [PATCH 11/47] Move two funct to data/functional.py. 
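
A minimal sketch (editor's illustration, not part of the original commit) of how the two
relocated helpers are expected to compose with get_tokenizer and build_vocab_from_iterator;
the Penn Treebank path is the hypothetical example path reused from the docstrings below:

    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator
    from torchtext.data.functional import read_text_iterator, create_data_from_iterator

    tokenizer = get_tokenizer("basic_english")
    train_path = '.data/ptb.train.txt'  # hypothetical local path, as in the docstring example

    # First pass over the file builds the vocabulary; the second pass numericalizes it
    # into the flat LongTensor that create_data_from_iterator returns at this commit.
    vocab = build_vocab_from_iterator(read_text_iterator(train_path, tokenizer))
    data = create_data_from_iterator(
        vocab, read_text_iterator(train_path, tokenizer), include_unk=False)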
--- torchtext/data/__init__.py | 6 ++- torchtext/data/functional.py | 59 ++++++++++++++++++++++++- torchtext/datasets/language_modeling.py | 44 ++++-------------- 3 files changed, 71 insertions(+), 38 deletions(-) diff --git a/torchtext/data/__init__.py b/torchtext/data/__init__.py index ea0096c6c2..5775326a13 100644 --- a/torchtext/data/__init__.py +++ b/torchtext/data/__init__.py @@ -10,7 +10,8 @@ from .functional import generate_sp_model, \ load_sp_model, \ sentencepiece_numericalizer, \ - sentencepiece_tokenizer, custom_replace, simple_space_split + sentencepiece_tokenizer, custom_replace, simple_space_split, \ + read_text_iterator, create_data_from_iterator __all__ = ["Batch", "Dataset", "TabularDataset", @@ -24,4 +25,5 @@ "get_tokenizer", "interleave_keys", "generate_sp_model", "load_sp_model", "sentencepiece_numericalizer", "sentencepiece_tokenizer", - "custom_replace", "simple_space_split"] + "custom_replace", "simple_space_split", + "read_text_iterator", "create_data_from_iterator"] diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 07c16e5a2d..f2085bb9c7 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -1,6 +1,10 @@ +import torch import sentencepiece as spm import re - +from torchtext.utils import unicode_csv_reader +import logging +import io +from torchtext.vocab import Vocab __all__ = [ "generate_sp_model", "load_sp_model", @@ -151,3 +155,56 @@ def simple_space_split(iterator): for line in iterator: yield line.split() + + +def read_text_iterator(path, tokenizer): + r"""Read text from path and yield a list of tokens based on the tokenizer + + Arguments: + path: the file path. + tokenizer: the tokenizer used to tokenize string text. + + Examples: + >>> from torchtext.data.functional import read_text_iterator + >>> tokenizer = get_tokenizer("basic_english") + >>> list((read_text_iterator('.data/ptb.train.txt', tokenizer))) + [['Sentencepiece', 'encode', 'as', 'pieces'], ['example', 'to', 'try!']] + """ + + with io.open(path, encoding="utf8") as f: + reader = unicode_csv_reader(f) + for row in reader: + tokens = tokenizer(' '.join(row)) + yield tokens + + +def create_data_from_iterator(vocab, iterator, include_unk): + r"""Create data from an token iterator. + + Arguments: + vocab: the vocabulary convert token into id. + iterator: the iterator yield a list of tokens. + include_unk: option to include unk token. 
+ + Examples: + >>> from torchtext.data.functional import simple_space_split + >>> from torchtext.data.functional import create_data_from_iterator + >>> vocab = {'Sentencepiece' : 0, 'encode' : 1, 'as' : 2, 'pieces' : 3} + >>> create_data_from_iterator(vocab, + >>> simple_space_split(["Sentencepiece as pieces"]), + >>> False) + >>> tensor([0, 2, 3]) + """ + + _data = [] + for tokens in iterator: + if include_unk: + tokens = [vocab[token] for token in tokens] + else: + token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] + for token in tokens])) + tokens = token_ids + if len(tokens) == 0: + logging.info('Row contains no tokens.') + _data += tokens + return torch.Tensor(_data).long() diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index a0393fb3c9..feb3b7ac47 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -1,12 +1,11 @@ import torch import logging -import io import os -from torchtext.utils import download_from_url, extract_archive, unicode_csv_reader +from torchtext.utils import download_from_url, extract_archive from torchtext.vocab import build_vocab_from_iterator from torchtext.data.utils import get_tokenizer from torchtext.vocab import Vocab -from tqdm import tqdm +from torchtext.data.functional import read_text_iterator, create_data_from_iterator URLS = { 'WikiText2': @@ -20,31 +19,6 @@ } -def _read_text_iterator(data_path, tokenizer): - with io.open(data_path, encoding="utf8") as f: - reader = unicode_csv_reader(f) - for row in reader: - tokens = tokenizer(' '.join(row)) - yield tokens - - -def _create_data_from_iterator(vocab, iterator, include_unk): - _data = [] - with tqdm(unit_scale=0, unit='lines') as t: - for tokens in iterator: - if include_unk: - tokens = [vocab[token] for token in tokens] - else: - token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] - for token in tokens])) - tokens = token_ids - if len(tokens) == 0: - logging.info('Row contains no tokens.') - _data += tokens - t.update(1) - return torch.Tensor(_data).long() - - class LanguageModelingDataset(torch.utils.data.Dataset): """Defines a dataset for language modeling. 
Currently, we only support the following datasets: @@ -110,20 +84,20 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if vocab is None: logging.info('Building Vocab based on {}'.format(train_path)) - vocab = build_vocab_from_iterator(_read_text_iterator(train_path, tokenizer)) + vocab = build_vocab_from_iterator(read_text_iterator(train_path, tokenizer)) else: if not isinstance(vocab, Vocab): raise TypeError("Passed vocabulary is not of type Vocab") logging.info('Vocab has {} entries'.format(len(vocab))) logging.info('Creating training data') - train_data = _create_data_from_iterator( - vocab, _read_text_iterator(train_path, tokenizer), include_unk) + train_data = create_data_from_iterator( + vocab, read_text_iterator(train_path, tokenizer), include_unk) logging.info('Creating testing data') - test_data = _create_data_from_iterator( - vocab, _read_text_iterator(test_path, tokenizer), include_unk) + test_data = create_data_from_iterator( + vocab, read_text_iterator(test_path, tokenizer), include_unk) logging.info('Creating valid data') - valid_data = _create_data_from_iterator( - vocab, _read_text_iterator(valid_path, tokenizer), include_unk) + valid_data = create_data_from_iterator( + vocab, read_text_iterator(valid_path, tokenizer), include_unk) return (LanguageModelingDataset(train_data, vocab), LanguageModelingDataset(test_data, vocab), LanguageModelingDataset(valid_data, vocab)) From f4018ccdd1174c53a3dec5ff5655f16c1fff9dab Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 5 Nov 2019 12:30:02 -0800 Subject: [PATCH 12/47] Fix <'unk'> compability issue. --- torchtext/data/functional.py | 10 +++++----- torchtext/datasets/language_modeling.py | 14 +++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index f2085bb9c7..5d5fc45c25 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -178,13 +178,13 @@ def read_text_iterator(path, tokenizer): yield tokens -def create_data_from_iterator(vocab, iterator, include_unk): +def create_data_from_iterator(vocab, iterator, removed_tokens=None): r"""Create data from an token iterator. Arguments: vocab: the vocabulary convert token into id. iterator: the iterator yield a list of tokens. - include_unk: option to include unk token. 
+ removed_tokens: removed tokens from output dataset (Default: None) Examples: >>> from torchtext.data.functional import simple_space_split @@ -192,16 +192,16 @@ def create_data_from_iterator(vocab, iterator, include_unk): >>> vocab = {'Sentencepiece' : 0, 'encode' : 1, 'as' : 2, 'pieces' : 3} >>> create_data_from_iterator(vocab, >>> simple_space_split(["Sentencepiece as pieces"]), - >>> False) + >>> removed_tokens=['']) >>> tensor([0, 2, 3]) """ _data = [] for tokens in iterator: - if include_unk: + if removed_tokens is None: tokens = [vocab[token] for token in tokens] else: - token_ids = list(filter(lambda x: x is not Vocab.UNK, [vocab[token] + token_ids = list(filter(lambda x: x not in removed_tokens, [vocab[token] for token in tokens])) tokens = token_ids if len(tokens) == 0: diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index feb3b7ac47..29517c70fc 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -66,7 +66,7 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', vocab=None, include_unk=False): + root='.data', vocab=None, removed_tokens=['']): if dataset_name == 'PennTreebank': train_path = download_from_url(URLS['PennTreebank'][0], root=root) test_path = download_from_url(URLS['PennTreebank'][1], root=root) @@ -91,13 +91,13 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), logging.info('Vocab has {} entries'.format(len(vocab))) logging.info('Creating training data') train_data = create_data_from_iterator( - vocab, read_text_iterator(train_path, tokenizer), include_unk) + vocab, read_text_iterator(train_path, tokenizer), removed_tokens) logging.info('Creating testing data') test_data = create_data_from_iterator( - vocab, read_text_iterator(test_path, tokenizer), include_unk) + vocab, read_text_iterator(test_path, tokenizer), removed_tokens) logging.info('Creating valid data') valid_data = create_data_from_iterator( - vocab, read_text_iterator(valid_path, tokenizer), include_unk) + vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) return (LanguageModelingDataset(train_data, vocab), LanguageModelingDataset(test_data, vocab), LanguageModelingDataset(valid_data, vocab)) @@ -117,7 +117,7 @@ def WikiText2(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) + removed_tokens: removed tokens from output dataset (Default: '') Examples: >>> from torchtext.datasets import WikiText2 @@ -145,7 +145,7 @@ def WikiText103(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - include_unk: include unknown token in the data (Default: False) + removed_tokens: removed tokens from output dataset (Default: '') Examples: >>> from torchtext.datasets import WikiText103 @@ -173,7 +173,7 @@ def PennTreebank(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. 
- include_unk: include unknown token in the data (Default: False) + removed_tokens: removed tokens from output dataset (Default: '') Examples: >>> from torchtext.datasets import PennTreebank From ff329f952e0f24253054ad55c6e8de03da157435 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 5 Nov 2019 13:05:10 -0800 Subject: [PATCH 13/47] Minor changes. --- torchtext/data/functional.py | 16 ++++++---------- torchtext/datasets/language_modeling.py | 24 ++++++++++++++++++------ 2 files changed, 24 insertions(+), 16 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 5d5fc45c25..d2aea53eda 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -1,10 +1,8 @@ -import torch import sentencepiece as spm import re from torchtext.utils import unicode_csv_reader import logging import io -from torchtext.vocab import Vocab __all__ = [ "generate_sp_model", "load_sp_model", @@ -179,7 +177,7 @@ def read_text_iterator(path, tokenizer): def create_data_from_iterator(vocab, iterator, removed_tokens=None): - r"""Create data from an token iterator. + r"""Yield a list of ids from an token iterator with a vocab. Arguments: vocab: the vocabulary convert token into id. @@ -190,13 +188,12 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): >>> from torchtext.data.functional import simple_space_split >>> from torchtext.data.functional import create_data_from_iterator >>> vocab = {'Sentencepiece' : 0, 'encode' : 1, 'as' : 2, 'pieces' : 3} - >>> create_data_from_iterator(vocab, - >>> simple_space_split(["Sentencepiece as pieces"]), - >>> removed_tokens=['']) - >>> tensor([0, 2, 3]) + >>> list(create_data_from_iterator(vocab, + >>> simple_space_split(["Sentencepiece as pieces", + >>> "as pieces"])) + >>> [[0, 2, 3], [2, 3]] """ - _data = [] for tokens in iterator: if removed_tokens is None: tokens = [vocab[token] for token in tokens] @@ -206,5 +203,4 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): tokens = token_ids if len(tokens) == 0: logging.info('Row contains no tokens.') - _data += tokens - return torch.Tensor(_data).long() + yield tokens diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 29517c70fc..37fd7d5b71 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -90,17 +90,29 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), raise TypeError("Passed vocabulary is not of type Vocab") logging.info('Vocab has {} entries'.format(len(vocab))) logging.info('Creating training data') - train_data = create_data_from_iterator( + train_iter = create_data_from_iterator( vocab, read_text_iterator(train_path, tokenizer), removed_tokens) + train_data = [] + for tokens in train_iter: + train_data += tokens + logging.info('Creating testing data') - test_data = create_data_from_iterator( + test_iter = create_data_from_iterator( vocab, read_text_iterator(test_path, tokenizer), removed_tokens) + test_data = [] + for tokens in test_iter: + test_data += tokens + logging.info('Creating valid data') - valid_data = create_data_from_iterator( + valid_iter = create_data_from_iterator( vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) - return (LanguageModelingDataset(train_data, vocab), - LanguageModelingDataset(test_data, vocab), - LanguageModelingDataset(valid_data, vocab)) + valid_data = [] + for tokens in valid_iter: + valid_data += tokens + + return 
(LanguageModelingDataset(torch.Tensor(train_data).long(), vocab), + LanguageModelingDataset(torch.Tensor(test_data).long(), vocab), + LanguageModelingDataset(torch.Tensor(valid_data).long(), vocab)) def WikiText2(*args, **kwargs): From 65c470cbf27edddebec66b7e7d88d3b477238bc3 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 5 Nov 2019 15:03:22 -0800 Subject: [PATCH 14/47] Update unit tests. --- test/data/test_builtin_datasets.py | 41 +++++++++++++----------------- 1 file changed, 18 insertions(+), 23 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index acd57e2528..7b67f68652 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -30,9 +30,8 @@ def test_wikitext2_legacy(self): bptt_len=30) # Delete the dataset after we're done to save disk space on CI - if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", "wikitext-2") - conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "wikitext-2") + conditional_remove(datafile) def test_wikitext2(self): from torchtext.datasets import WikiText2 @@ -47,11 +46,10 @@ def test_wikitext2(self): self.assertEqual(tokens_ids, [2, 285, 502, 699]) # Delete the dataset after we're done to save disk space on CI - if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", "wikitext-2") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") - conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "wikitext-2") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "wikitext-2-v1.zip") + conditional_remove(datafile) @slow def test_penntreebank_legacy(self): @@ -68,9 +66,8 @@ def test_penntreebank_legacy(self): bptt_len=30) # Delete the dataset after we're done to save disk space on CI - if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", "penn-treebank") - conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "penn-treebank") + conditional_remove(datafile) def test_penntreebank(self): from torchtext.datasets import PennTreebank @@ -85,13 +82,12 @@ def test_penntreebank(self): self.assertEqual(tokens_ids, [2, 2550, 3344, 1125]) # Delete the dataset after we're done to save disk space on CI - if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') - conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.train.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.test.txt') + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", 'ptb.valid.txt') + conditional_remove(datafile) def test_text_classification(self): # smoke test to ensure ag_news dataset works properly @@ -104,8 +100,7 @@ def test_text_classification(self): self.assertEqual(len(ag_news_test), 7600) # Delete the dataset after we're done to save disk space on CI - if os.environ.get("TRAVIS") == "true": - datafile = os.path.join(self.project_root, ".data", "ag_news_csv") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") - 
conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "ag_news_csv") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "ag_news_csv.tar.gz") + conditional_remove(datafile) From 25336b948205622b9cddf395d35f4fead44a4174 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 11 Nov 2019 10:04:04 -0800 Subject: [PATCH 15/47] Minor change --- torchtext/datasets/language_modeling.py | 1 + 1 file changed, 1 insertion(+) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 37fd7d5b71..19d53ba82a 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -88,6 +88,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), else: if not isinstance(vocab, Vocab): raise TypeError("Passed vocabulary is not of type Vocab") + logging.info('Vocab has {} entries'.format(len(vocab))) logging.info('Creating training data') train_iter = create_data_from_iterator( From 4819f1836c8cd12136144ae053528f541e17cbb4 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 18 Nov 2019 11:54:13 -0800 Subject: [PATCH 16/47] Add flags for train/valid/test/ --- torchtext/datasets/language_modeling.py | 58 +++++++++++++++---------- 1 file changed, 34 insertions(+), 24 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 19d53ba82a..019165e9d3 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -66,7 +66,9 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', vocab=None, removed_tokens=['']): + root='.data', vocab=None, removed_tokens=[''], + train_filename='train', test_filename='test', + valid_filename='valid'): if dataset_name == 'PennTreebank': train_path = download_from_url(URLS['PennTreebank'][0], root=root) test_path = download_from_url(URLS['PennTreebank'][1], root=root) @@ -74,46 +76,54 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = extract_archive(dataset_tar) + + train_path = None + test_path = None + valid_path = None for fname in extracted_files: - if 'train' in fname: + if train_filename in fname: train_path = os.path.join(root, fname) - if 'test' in fname: + if test_filename in fname: test_path = os.path.join(root, fname) - if 'valid' in fname: + if valid_filename in fname: valid_path = os.path.join(root, fname) if vocab is None: logging.info('Building Vocab based on {}'.format(train_path)) + if train_path is None: + raise TypeError("Train file is not defined correctly to generate vocabulary") vocab = build_vocab_from_iterator(read_text_iterator(train_path, tokenizer)) + logging.info('Vocab has {} entries'.format(len(vocab))) else: if not isinstance(vocab, Vocab): raise TypeError("Passed vocabulary is not of type Vocab") - logging.info('Vocab has {} entries'.format(len(vocab))) - logging.info('Creating training data') - train_iter = create_data_from_iterator( - vocab, read_text_iterator(train_path, tokenizer), removed_tokens) train_data = [] - for tokens in train_iter: - train_data += tokens + if train_path is not None: + logging.info('Creating training data') + train_iter = create_data_from_iterator( + vocab, read_text_iterator(train_path, tokenizer), removed_tokens) + for tokens in train_iter: + train_data += tokens - logging.info('Creating 
testing data') - test_iter = create_data_from_iterator( - vocab, read_text_iterator(test_path, tokenizer), removed_tokens) test_data = [] - for tokens in test_iter: - test_data += tokens + if test_path is not None: + logging.info('Creating testing data') + test_iter = create_data_from_iterator( + vocab, read_text_iterator(test_path, tokenizer), removed_tokens) + for tokens in test_iter: + test_data += tokens - logging.info('Creating valid data') - valid_iter = create_data_from_iterator( - vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) valid_data = [] - for tokens in valid_iter: - valid_data += tokens - - return (LanguageModelingDataset(torch.Tensor(train_data).long(), vocab), - LanguageModelingDataset(torch.Tensor(test_data).long(), vocab), - LanguageModelingDataset(torch.Tensor(valid_data).long(), vocab)) + if valid_path is not None: + logging.info('Creating valid data') + valid_iter = create_data_from_iterator( + vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) + for tokens in valid_iter: + valid_data += tokens + + return tuple(LanguageModelingDataset(torch.Tensor(d).long(), vocab) + for d in (train_data, valid_data, test_data) if d != []) def WikiText2(*args, **kwargs): From 48cb0a8b5b360437dbd3134b6b8463415ef5a2d7 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Tue, 19 Nov 2019 10:10:53 -0800 Subject: [PATCH 17/47] Update docs. --- torchtext/datasets/language_modeling.py | 28 +++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 019165e9d3..f641c5e379 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -81,11 +81,11 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), test_path = None valid_path = None for fname in extracted_files: - if train_filename in fname: + if train_filename and train_filename in fname: train_path = os.path.join(root, fname) - if test_filename in fname: + if test_filename and test_filename in fname: test_path = os.path.join(root, fname) - if valid_filename in fname: + if valid_filename and valid_filename in fname: valid_path = os.path.join(root, fname) if vocab is None: @@ -123,7 +123,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), valid_data += tokens return tuple(LanguageModelingDataset(torch.Tensor(d).long(), vocab) - for d in (train_data, valid_data, test_data) if d != []) + for d in (train_data, test_data, valid_data) if d != []) def WikiText2(*args, **kwargs): @@ -141,6 +141,14 @@ def WikiText2(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: '') + train_filename: the filename for train (Default: 'train'). If set to None, + train dataset will not be generated. + test_filename: the filename for test (Default: 'test'). If set to None, + test dataset will not be generated. If train_filename is set to None, a + vocab object is required to generate test dataset. + valid_filename: the filename for valid (Default: 'valid'). If set to None, + valid dataset will not be generated. If train_filename is set to None, a + vocab object is required to generate valid dataset. 
Examples: >>> from torchtext.datasets import WikiText2 @@ -148,6 +156,8 @@ def WikiText2(*args, **kwargs): >>> tokenizer = get_tokenizer("spacy") >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() + >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab, + train_filename=None, test_filename=None) """ @@ -169,6 +179,14 @@ def WikiText103(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: '') + train_filename: the filename for train (Default: 'train'). If set to None, + train dataset will not be generated. + test_filename: the filename for test (Default: 'test'). If set to None, + test dataset will not be generated. If train_filename is set to None, a + vocab object is required to generate test dataset. + valid_filename: the filename for valid (Default: 'valid'). If set to None, + valid dataset will not be generated. If train_filename is set to None, a + vocab object is required to generate valid dataset. Examples: >>> from torchtext.datasets import WikiText103 @@ -176,6 +194,8 @@ def WikiText103(*args, **kwargs): >>> tokenizer = get_tokenizer("spacy") >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() + >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab, + train_filename=None, test_filename=None) """ From 7d70298ccfd417e931d2f7ffce067f464dbc1f32 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 20 Nov 2019 14:23:42 -0800 Subject: [PATCH 18/47] Add returned_dataset flag to determin subset data. --- torchtext/datasets/language_modeling.py | 66 ++++++++++++++----------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index f641c5e379..905cf218ed 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -67,25 +67,27 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[''], - train_filename='train', test_filename='test', - valid_filename='valid'): + returned_datasets=('train', 'test', 'valid')): + train_path = None + test_path = None + valid_path = None if dataset_name == 'PennTreebank': - train_path = download_from_url(URLS['PennTreebank'][0], root=root) - test_path = download_from_url(URLS['PennTreebank'][1], root=root) - valid_path = download_from_url(URLS['PennTreebank'][2], root=root) + if 'train' in returned_datasets: + train_path = download_from_url(URLS['PennTreebank'][0], root=root) + if 'test' in returned_datasets: + test_path = download_from_url(URLS['PennTreebank'][1], root=root) + if 'valid' in returned_datasets: + valid_path = download_from_url(URLS['PennTreebank'][2], root=root) else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = extract_archive(dataset_tar) - train_path = None - test_path = None - valid_path = None for fname in extracted_files: - if train_filename and train_filename in fname: + if 'train' in returned_datasets: train_path = os.path.join(root, fname) - if test_filename and test_filename in fname: + if 'test' in returned_datasets: test_path = os.path.join(root, fname) - if valid_filename and valid_filename in fname: + if 'valid' in returned_datasets: valid_path = os.path.join(root, fname) if vocab is None: 
@@ -141,14 +143,11 @@ def WikiText2(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: '') - train_filename: the filename for train (Default: 'train'). If set to None, - train dataset will not be generated. - test_filename: the filename for test (Default: 'test'). If set to None, - test dataset will not be generated. If train_filename is set to None, a - vocab object is required to generate test dataset. - valid_filename: the filename for valid (Default: 'valid'). If set to None, - valid dataset will not be generated. If train_filename is set to None, a - vocab object is required to generate valid dataset. + returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test'). + If 'train' is not in the tuple, an vocab object should be provided which will + be used to process valid and/or test data. Examples: >>> from torchtext.datasets import WikiText2 @@ -157,7 +156,7 @@ def WikiText2(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab, - train_filename=None, test_filename=None) + returned_datasets=('valid')) """ @@ -178,15 +177,17 @@ def WikiText103(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. + returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test'). + If 'train' is not in the tuple, an vocab object should be provided which will + be used to process valid and/or test data. removed_tokens: removed tokens from output dataset (Default: '') - train_filename: the filename for train (Default: 'train'). If set to None, - train dataset will not be generated. - test_filename: the filename for test (Default: 'test'). If set to None, - test dataset will not be generated. If train_filename is set to None, a - vocab object is required to generate test dataset. - valid_filename: the filename for valid (Default: 'valid'). If set to None, - valid dataset will not be generated. If train_filename is set to None, a - vocab object is required to generate valid dataset. + returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test'). + If 'train' is not in the tuple, an vocab object should be provided which will + be used to process valid and/or test data. Examples: >>> from torchtext.datasets import WikiText103 @@ -195,7 +196,7 @@ def WikiText103(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab, - train_filename=None, test_filename=None) + returned_datasets=('valid')) """ @@ -217,6 +218,11 @@ def PennTreebank(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. 
removed_tokens: removed tokens from output dataset (Default: '') + returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test'). + If 'train' is not in the tuple, an vocab object should be provided which will + be used to process valid and/or test data. Examples: >>> from torchtext.datasets import PennTreebank @@ -224,6 +230,8 @@ def PennTreebank(*args, **kwargs): >>> tokenizer = get_tokenizer("spacy") >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() + >>> valid_dataset, = PennTreebank(tokenizer=tokenizer, vocab=vocab, + returned_datasets=('valid')) """ From 0588f1d40b30c5fe1a42c79667f50792d738de4f Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Wed, 20 Nov 2019 15:27:35 -0800 Subject: [PATCH 19/47] A small bug. --- torchtext/datasets/language_modeling.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 905cf218ed..53133554ef 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -83,11 +83,11 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), extracted_files = extract_archive(dataset_tar) for fname in extracted_files: - if 'train' in returned_datasets: + if 'train' in returned_datasets and 'train' in fname: train_path = os.path.join(root, fname) - if 'test' in returned_datasets: + elif 'test' in returned_datasets and 'test' in fname: test_path = os.path.join(root, fname) - if 'valid' in returned_datasets: + elif 'valid' in returned_datasets and 'valid' in fname: valid_path = os.path.join(root, fname) if vocab is None: @@ -123,7 +123,9 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) for tokens in valid_iter: valid_data += tokens - + print("len(train_data): ", len(train_data)) + print("len(test_data): ", len(test_data)) + print("len(valid_data): ", len(valid_data)) return tuple(LanguageModelingDataset(torch.Tensor(d).long(), vocab) for d in (train_data, test_data, valid_data) if d != []) From f01037db9b9bc3a61f659ac49cc200c075f1fdd0 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 07:25:48 -0800 Subject: [PATCH 20/47] Remove some printout. --- torchtext/datasets/language_modeling.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 53133554ef..02b79439c4 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -123,9 +123,6 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) for tokens in valid_iter: valid_data += tokens - print("len(train_data): ", len(train_data)) - print("len(test_data): ", len(test_data)) - print("len(valid_data): ", len(valid_data)) return tuple(LanguageModelingDataset(torch.Tensor(d).long(), vocab) for d in (train_data, test_data, valid_data) if d != []) From f2ea3f143614003829dd18c28d226ccfc36472ee Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 07:31:47 -0800 Subject: [PATCH 21/47] Remove unk token. 
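The diff below drops the single default entry from removed_tokens, so nothing is filtered out of the numericalized stream unless the caller asks for it explicitly. A minimal sketch of opting back in, with a toy dict standing in for the vocab (the '<unk>' token here is only an illustrative choice, not something fixed by the API):

vocab = {'<unk>': 0, 'the': 1, 'player': 2, 'rest': 3}
tokens = ['the', '<unk>', 'player', 'rest']

removed_tokens = ['<unk>']
ids = [vocab[t] for t in tokens if t not in removed_tokens]
print(ids)  # [1, 2, 3]
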
--- torchtext/datasets/language_modeling.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 02b79439c4..d3666d675d 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -66,7 +66,7 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), - root='.data', vocab=None, removed_tokens=[''], + root='.data', vocab=None, removed_tokens=[], returned_datasets=('train', 'test', 'valid')): train_path = None test_path = None @@ -141,7 +141,7 @@ def WikiText2(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: '') + removed_tokens: removed tokens from output dataset (Default: []) returned_datasets: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). @@ -181,7 +181,7 @@ def WikiText103(*args, **kwargs): could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will be used to process valid and/or test data. - removed_tokens: removed tokens from output dataset (Default: '') + removed_tokens: removed tokens from output dataset (Default: []) returned_datasets: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). @@ -216,7 +216,7 @@ def PennTreebank(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - removed_tokens: removed tokens from output dataset (Default: '') + removed_tokens: removed tokens from output dataset (Default: []) returned_datasets: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). From a32712d376fbe444ffbcc85c941bfa7c583a1dc9 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 07:37:51 -0800 Subject: [PATCH 22/47] Use data_select. 
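The rename below, returned_datasets to data_select, is cosmetic, but the two-step calling pattern it documents is worth spelling out. A hedged sketch of that usage, assuming the archive downloads cleanly (passing data_select as a plain string is accepted a couple of patches further on):

from torchtext.datasets import WikiText2

# Build all three splits once; the vocab is derived from the train split.
train_dataset, test_dataset, valid_dataset = WikiText2()
vocab = train_dataset.get_vocab()

# Later, rebuild only the validation split while reusing the saved vocab.
valid_dataset, = WikiText2(vocab=vocab, data_select='valid')
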
--- torchtext/datasets/language_modeling.py | 28 ++++++++++++------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index d3666d675d..2cbc96698d 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -67,27 +67,27 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[], - returned_datasets=('train', 'test', 'valid')): + data_select=('train', 'test', 'valid')): train_path = None test_path = None valid_path = None if dataset_name == 'PennTreebank': - if 'train' in returned_datasets: + if 'train' in data_select: train_path = download_from_url(URLS['PennTreebank'][0], root=root) - if 'test' in returned_datasets: + if 'test' in data_select: test_path = download_from_url(URLS['PennTreebank'][1], root=root) - if 'valid' in returned_datasets: + if 'valid' in data_select: valid_path = download_from_url(URLS['PennTreebank'][2], root=root) else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = extract_archive(dataset_tar) for fname in extracted_files: - if 'train' in returned_datasets and 'train' in fname: + if 'train' in data_select and 'train' in fname: train_path = os.path.join(root, fname) - elif 'test' in returned_datasets and 'test' in fname: + elif 'test' in data_select and 'test' in fname: test_path = os.path.join(root, fname) - elif 'valid' in returned_datasets and 'valid' in fname: + elif 'valid' in data_select and 'valid' in fname: valid_path = os.path.join(root, fname) if vocab is None: @@ -142,7 +142,7 @@ def WikiText2(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: []) - returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + data_select: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will @@ -155,7 +155,7 @@ def WikiText2(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab, - returned_datasets=('valid')) + data_select=('valid')) """ @@ -176,13 +176,13 @@ def WikiText103(*args, **kwargs): root: Directory where the datasets are saved. Default: ".data" vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. - returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + data_select: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will be used to process valid and/or test data. removed_tokens: removed tokens from output dataset (Default: []) - returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + data_select: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. 
Users could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will @@ -195,7 +195,7 @@ def WikiText103(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab, - returned_datasets=('valid')) + data_select=('valid')) """ @@ -217,7 +217,7 @@ def PennTreebank(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: []) - returned_datasets: the returned datasets (Default: ('train', 'test','valid')) + data_select: the returned datasets (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will @@ -230,7 +230,7 @@ def PennTreebank(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = PennTreebank(tokenizer=tokenizer, vocab=vocab, - returned_datasets=('valid')) + data_select=('valid')) """ From d2172943a41dead3da411bbdc30e2c2d370c9ba4 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 07:52:58 -0800 Subject: [PATCH 23/47] Support a string in data_select. --- torchtext/datasets/language_modeling.py | 40 +++++++++++++++---------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 2cbc96698d..eedbf800ce 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -68,9 +68,13 @@ def get_vocab(self): def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[], data_select=('train', 'test', 'valid')): + train_path = None test_path = None valid_path = None + + if isinstance(data_select, str): + data_select = [data_select] if dataset_name == 'PennTreebank': if 'train' in data_select: train_path = download_from_url(URLS['PennTreebank'][0], root=root) @@ -142,11 +146,13 @@ def WikiText2(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: []) - data_select: the returned datasets (Default: ('train', 'test','valid')) + data_select: a string or tupel for the returned datasets + (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users - could also choose any one or two of them, for example ('train', 'test'). - If 'train' is not in the tuple, an vocab object should be provided which will - be used to process valid and/or test data. + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. 
Examples: >>> from torchtext.datasets import WikiText2 @@ -155,7 +161,7 @@ def WikiText2(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText2(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText2(tokenizer=tokenizer, vocab=vocab, - data_select=('valid')) + data_select='valid') """ @@ -182,11 +188,13 @@ def WikiText103(*args, **kwargs): If 'train' is not in the tuple, an vocab object should be provided which will be used to process valid and/or test data. removed_tokens: removed tokens from output dataset (Default: []) - data_select: the returned datasets (Default: ('train', 'test','valid')) + data_select: a string or tupel for the returned datasets + (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users - could also choose any one or two of them, for example ('train', 'test'). - If 'train' is not in the tuple, an vocab object should be provided which will - be used to process valid and/or test data. + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. Examples: >>> from torchtext.datasets import WikiText103 @@ -195,7 +203,7 @@ def WikiText103(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = WikiText103(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = WikiText103(tokenizer=tokenizer, vocab=vocab, - data_select=('valid')) + data_select='valid') """ @@ -217,11 +225,13 @@ def PennTreebank(*args, **kwargs): vocab: Vocabulary used for dataset. If None, it will generate a new vocabulary based on the train data set. removed_tokens: removed tokens from output dataset (Default: []) - data_select: the returned datasets (Default: ('train', 'test','valid')) + data_select: a string or tupel for the returned datasets + (Default: ('train', 'test','valid')) By default, all the three datasets (train, test, valid) are generated. Users - could also choose any one or two of them, for example ('train', 'test'). - If 'train' is not in the tuple, an vocab object should be provided which will - be used to process valid and/or test data. + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. Examples: >>> from torchtext.datasets import PennTreebank @@ -230,7 +240,7 @@ def PennTreebank(*args, **kwargs): >>> train_dataset, test_dataset, valid_dataset = PennTreebank(tokenizer=tokenizer) >>> vocab = train_dataset.get_vocab() >>> valid_dataset, = PennTreebank(tokenizer=tokenizer, vocab=vocab, - data_select=('valid')) + data_select='valid') """ From cb902d47e0df1cd871fac1d2a2dad83120b5f065 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 07:57:30 -0800 Subject: [PATCH 24/47] Use torch.tensor instead of torch.Tensor --- torchtext/datasets/language_modeling.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index eedbf800ce..09a2d6ba00 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -35,12 +35,12 @@ def __init__(self, data, vocab): Arguments: data: a tensor of tokens. tokens are ids after numericalizing the string tokens. 
- torch.Tensor([token_id_1, token_id_2, token_id_3, token_id1]).long() + torch.tensor([token_id_1, token_id_2, token_id_3, token_id1]).long() vocab: Vocabulary object used for dataset. Examples: >>> from torchtext.vocab import build_vocab_from_iterator - >>> data = torch.Tensor([token_id_1, token_id_2, + >>> data = torch.tensor([token_id_1, token_id_2, token_id_3, token_id_1]).long() >>> vocab = build_vocab_from_iterator([['language', 'modeling']]) >>> dataset = LanguageModelingDataset(data, vocab) @@ -127,7 +127,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) for tokens in valid_iter: valid_data += tokens - return tuple(LanguageModelingDataset(torch.Tensor(d).long(), vocab) + return tuple(LanguageModelingDataset(torch.tensor(d).long(), vocab) for d in (train_data, test_data, valid_data) if d != []) From 3a05197d883647a80ea6a330ada5055e4a04c5ed Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 08:10:17 -0800 Subject: [PATCH 25/47] remove duplicate code. --- torchtext/datasets/language_modeling.py | 64 +++++++++---------------- 1 file changed, 22 insertions(+), 42 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 09a2d6ba00..856223395f 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -69,66 +69,46 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[], data_select=('train', 'test', 'valid')): - train_path = None - test_path = None - valid_path = None - + _path = {} if isinstance(data_select, str): data_select = [data_select] if dataset_name == 'PennTreebank': if 'train' in data_select: - train_path = download_from_url(URLS['PennTreebank'][0], root=root) + _path['train'] = download_from_url(URLS['PennTreebank'][0], root=root) if 'test' in data_select: - test_path = download_from_url(URLS['PennTreebank'][1], root=root) + _path['test'] = download_from_url(URLS['PennTreebank'][1], root=root) if 'valid' in data_select: - valid_path = download_from_url(URLS['PennTreebank'][2], root=root) + _path['valid'] = download_from_url(URLS['PennTreebank'][2], root=root) else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = extract_archive(dataset_tar) - for fname in extracted_files: - if 'train' in data_select and 'train' in fname: - train_path = os.path.join(root, fname) - elif 'test' in data_select and 'test' in fname: - test_path = os.path.join(root, fname) - elif 'valid' in data_select and 'valid' in fname: - valid_path = os.path.join(root, fname) + for item in data_select: + for fname in extracted_files: + if item in fname: + _path[item] = os.path.join(root, fname) if vocab is None: - logging.info('Building Vocab based on {}'.format(train_path)) - if train_path is None: + if 'train' not in _path.keys(): raise TypeError("Train file is not defined correctly to generate vocabulary") - vocab = build_vocab_from_iterator(read_text_iterator(train_path, tokenizer)) + logging.info('Building Vocab based on {}'.format(_path['train'])) + vocab = build_vocab_from_iterator(read_text_iterator(_path['train'], tokenizer)) logging.info('Vocab has {} entries'.format(len(vocab))) else: if not isinstance(vocab, Vocab): raise TypeError("Passed vocabulary is not of type Vocab") - train_data = [] - if train_path is not None: - logging.info('Creating training data') - train_iter = 
create_data_from_iterator( - vocab, read_text_iterator(train_path, tokenizer), removed_tokens) - for tokens in train_iter: - train_data += tokens - - test_data = [] - if test_path is not None: - logging.info('Creating testing data') - test_iter = create_data_from_iterator( - vocab, read_text_iterator(test_path, tokenizer), removed_tokens) - for tokens in test_iter: - test_data += tokens - - valid_data = [] - if valid_path is not None: - logging.info('Creating valid data') - valid_iter = create_data_from_iterator( - vocab, read_text_iterator(valid_path, tokenizer), removed_tokens) - for tokens in valid_iter: - valid_data += tokens - return tuple(LanguageModelingDataset(torch.tensor(d).long(), vocab) - for d in (train_data, test_data, valid_data) if d != []) + _data = {} + for item in _path.keys(): + _data[item] = [] + logging.info('Creating {} data'.format(item)) + _iter = create_data_from_iterator( + vocab, read_text_iterator(_path[item], tokenizer), removed_tokens) + for tokens in _iter: + _data[item] += tokens + + return tuple(LanguageModelingDataset(torch.tensor(_data[d]).long(), vocab) + for d in data_select if _data[d] != []) def WikiText2(*args, **kwargs): From ac993296b7c09e6032829673186759f9a2a0a8b5 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 08:19:38 -0800 Subject: [PATCH 26/47] Minor change in doc. --- torchtext/datasets/language_modeling.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 856223395f..b683f04796 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -72,6 +72,10 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), _path = {} if isinstance(data_select, str): data_select = [data_select] + for item in data_select: + if item not in ('train', 'test', 'valid'): + raise TypeError('{} in data_select is not supported!'.format(item)) + if dataset_name == 'PennTreebank': if 'train' in data_select: _path['train'] = download_from_url(URLS['PennTreebank'][0], root=root) From 3a342c0f7f9065bb0e369af06c6b8fc192c4da14 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 08:48:00 -0800 Subject: [PATCH 27/47] Change the extracted_files. 
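The diff below switches extracted_files to full paths (root is joined in up front), so the per-split lookup that follows can store the entries directly. A small sketch of that lookup, using hypothetical filenames rather than the real archive contents:

import os

root = '.data'
# Hypothetical relative names standing in for what extract_archive returns.
relative = ['wikitext-2/wiki.train.tokens',
            'wikitext-2/wiki.test.tokens',
            'wikitext-2/wiki.valid.tokens']
extracted_files = [os.path.join(root, d) for d in relative]

data_select = ('train', 'test', 'valid')
_path = {}
for item in data_select:
    for fname in extracted_files:
        if item in fname:
            _path[item] = fname
print(_path['train'])  # .data/wikitext-2/wiki.train.tokens
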
--- torchtext/datasets/language_modeling.py | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index b683f04796..4aad8a296f 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -69,7 +69,6 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[], data_select=('train', 'test', 'valid')): - _path = {} if isinstance(data_select, str): data_select = [data_select] for item in data_select: @@ -77,20 +76,22 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), raise TypeError('{} in data_select is not supported!'.format(item)) if dataset_name == 'PennTreebank': + extracted_files = [] if 'train' in data_select: - _path['train'] = download_from_url(URLS['PennTreebank'][0], root=root) + extracted_files.append(download_from_url(URLS['PennTreebank'][0], root=root)) if 'test' in data_select: - _path['test'] = download_from_url(URLS['PennTreebank'][1], root=root) + extracted_files.append(download_from_url(URLS['PennTreebank'][1], root=root)) if 'valid' in data_select: - _path['valid'] = download_from_url(URLS['PennTreebank'][2], root=root) + extracted_files.append(download_from_url(URLS['PennTreebank'][2], root=root)) else: dataset_tar = download_from_url(URLS[dataset_name], root=root) - extracted_files = extract_archive(dataset_tar) + extracted_files = [os.path.join(root, d) for d in extract_archive(dataset_tar)] - for item in data_select: - for fname in extracted_files: - if item in fname: - _path[item] = os.path.join(root, fname) + _path = {} + for item in data_select: + for fname in extracted_files: + if item in fname: + _path[item] = fname if vocab is None: if 'train' not in _path.keys(): From 149cbc4f237c52e29840feeefd1336d96b60f2ac Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 08:49:40 -0800 Subject: [PATCH 28/47] Docs. 
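The wording change below belongs to the guard added earlier in the series: requesting splits without 'train' and without passing a vocab cannot work, because there is nothing to build a vocabulary from. A sketch of that failure mode, for illustration only (the archive is still downloaded before the check fires):

from torchtext.datasets import WikiText2

try:
    WikiText2(data_select='valid')  # no vocab supplied, no train split requested
except TypeError as err:
    print(err)  # the message introduced in the diff below
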
--- torchtext/datasets/language_modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 4aad8a296f..70b1ebf91b 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -95,7 +95,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if vocab is None: if 'train' not in _path.keys(): - raise TypeError("Train file is not defined correctly to generate vocabulary") + raise TypeError("Must pass a vocab if train is not selected.") logging.info('Building Vocab based on {}'.format(_path['train'])) vocab = build_vocab_from_iterator(read_text_iterator(_path['train'], tokenizer)) logging.info('Vocab has {} entries'.format(len(vocab))) From 6cfe9c9b18417267b3dfab8469d698ca891c1cf0 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Thu, 21 Nov 2019 08:56:11 -0800 Subject: [PATCH 29/47] get_data_path --- torchtext/datasets/language_modeling.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 70b1ebf91b..093aa37ac1 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -65,6 +65,12 @@ def get_vocab(self): return self._vocab +def _get_datafile_path(key, extracted_files): + for fname in extracted_files: + if key in fname: + return fname + + def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), root='.data', vocab=None, removed_tokens=[], data_select=('train', 'test', 'valid')): @@ -89,9 +95,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), _path = {} for item in data_select: - for fname in extracted_files: - if item in fname: - _path[item] = fname + _path[item] = _get_datafile_path(item, extracted_files) if vocab is None: if 'train' not in _path.keys(): From 297d1cc3cd6acfb6f7949b17ea94441bbbc744e1 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 08:23:42 -0800 Subject: [PATCH 30/47] Remove token. --- torchtext/data/functional.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index d2aea53eda..9c15ff8f5e 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -198,8 +198,9 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): if removed_tokens is None: tokens = [vocab[token] for token in tokens] else: - token_ids = list(filter(lambda x: x not in removed_tokens, [vocab[token] - for token in tokens])) + token_ids = list(filter(lambda x: + x not in [vocab[token] for token in removed_tokens], + [vocab[token] for token in tokens])) tokens = token_ids if len(tokens) == 0: logging.info('Row contains no tokens.') From d548bf6885c4fd131807cd20115a1dd02ab3d07f Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 10:45:24 -0800 Subject: [PATCH 31/47] Replace _data with data. 
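The rename below makes the token tensor and the vocabulary plain public attributes of the dataset. A short sketch of what a constructed dataset then exposes, built from a toy vocab (an illustration only, not the real WikiText pipeline):

import torch
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets.language_modeling import LanguageModelingDataset

vocab = build_vocab_from_iterator([['language', 'modeling', 'dataset']])
ids = torch.tensor([vocab[t] for t in
                    ['language', 'modeling', 'language', 'dataset']]).long()
dataset = LanguageModelingDataset(ids, vocab)

print(len(dataset))            # 4
print(int(dataset[0]))         # id of 'language'
print(dataset.vocab is vocab)  # True once the attributes are public
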
--- torchtext/datasets/language_modeling.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 093aa37ac1..a6ef3fea71 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -48,21 +48,21 @@ def __init__(self, data, vocab): """ super(LanguageModelingDataset, self).__init__() - self._data = data - self._vocab = vocab + self.data = data + self.vocab = vocab def __getitem__(self, i): - return self._data[i] + return self.data[i] def __len__(self): - return len(self._data) + return len(self.data) def __iter__(self): - for x in self._data: + for x in self.data: yield x def get_vocab(self): - return self._vocab + return self.vocab def _get_datafile_path(key, extracted_files): @@ -107,17 +107,17 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if not isinstance(vocab, Vocab): raise TypeError("Passed vocabulary is not of type Vocab") - _data = {} + data = {} for item in _path.keys(): - _data[item] = [] + data[item] = [] logging.info('Creating {} data'.format(item)) _iter = create_data_from_iterator( vocab, read_text_iterator(_path[item], tokenizer), removed_tokens) for tokens in _iter: - _data[item] += tokens + data[item] += tokens - return tuple(LanguageModelingDataset(torch.tensor(_data[d]).long(), vocab) - for d in data_select if _data[d] != []) + return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab) + for d in data_select if data[d] != []) def WikiText2(*args, **kwargs): From e77758e642b2698ebb5d12cdb2cb7953adcb8c1c Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 13:09:16 -0800 Subject: [PATCH 32/47] Change create_data_from_iterator to double iter. 
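The diff below turns create_data_from_iterator into a generator of per-line id iterators, so callers flatten lazily instead of receiving one big list. A sketch mirroring the updated docstring example (the function is renamed to numericalize_tokens_from_iterator near the end of the series):

from torchtext.data.functional import create_data_from_iterator, simple_space_split

vocab = {'Sentencepiece': 0, 'encode': 1, 'as': 2, 'pieces': 3}
ids_iter = create_data_from_iterator(
    vocab, simple_space_split(["Sentencepiece as pieces", "as pieces"]))

data = []
for ids in ids_iter:             # each item is itself an iterator of ids
    data += [num for num in ids]  # flatten, as _setup_datasets does
print(data)  # [0, 2, 3, 2, 3]
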
--- torchtext/data/functional.py | 20 +++++++++----------- torchtext/datasets/language_modeling.py | 2 +- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 9c15ff8f5e..4abce321b5 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -188,20 +188,18 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): >>> from torchtext.data.functional import simple_space_split >>> from torchtext.data.functional import create_data_from_iterator >>> vocab = {'Sentencepiece' : 0, 'encode' : 1, 'as' : 2, 'pieces' : 3} - >>> list(create_data_from_iterator(vocab, - >>> simple_space_split(["Sentencepiece as pieces", + >>> ids_iter = create_data_from_iterator(vocab, + >>> simple_space_split(["Sentencepiece as pieces", >>> "as pieces"])) - >>> [[0, 2, 3], [2, 3]] + >>> for ids in ids_iter: + >>> print([num for num in ids]) + >>> [0, 2, 3] + >>> [2, 3] """ for tokens in iterator: if removed_tokens is None: - tokens = [vocab[token] for token in tokens] + yield iter(vocab[token] for token in tokens) else: - token_ids = list(filter(lambda x: - x not in [vocab[token] for token in removed_tokens], - [vocab[token] for token in tokens])) - tokens = token_ids - if len(tokens) == 0: - logging.info('Row contains no tokens.') - yield tokens + tokens = list(filter(lambda x: x not in removed_tokens, tokens)) + yield iter(vocab[token] for token in tokens) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index a6ef3fea71..8fe5167781 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -114,7 +114,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), _iter = create_data_from_iterator( vocab, read_text_iterator(_path[item], tokenizer), removed_tokens) for tokens in _iter: - data[item] += tokens + data[item] += [token_id for token_id in tokens] return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab) for d in data_select if data[d] != []) From 6d49f406d96f5fb42c0e1aa02c597d817399fe9b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 13:14:05 -0800 Subject: [PATCH 33/47] Add select_to_index. 
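The diff below collapses the three PennTreebank download branches into an index lookup: each requested split name maps to a position in the URL list. A sketch with placeholder URLs standing in for URLS['PennTreebank']:

# Placeholder URLs for illustration; the real ones live in URLS['PennTreebank'].
ptb_urls = ['https://example.org/ptb.train.txt',
            'https://example.org/ptb.test.txt',
            'https://example.org/ptb.valid.txt']

data_select = ('train', 'valid')
select_to_index = {'train': 0, 'test': 1, 'valid': 2}
to_download = [ptb_urls[select_to_index[key]] for key in data_select]
print(to_download)  # train URL first, then valid URL
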
--- torchtext/datasets/language_modeling.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 8fe5167781..f6ffd0f4f0 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -83,12 +83,9 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if dataset_name == 'PennTreebank': extracted_files = [] - if 'train' in data_select: - extracted_files.append(download_from_url(URLS['PennTreebank'][0], root=root)) - if 'test' in data_select: - extracted_files.append(download_from_url(URLS['PennTreebank'][1], root=root)) - if 'valid' in data_select: - extracted_files.append(download_from_url(URLS['PennTreebank'][2], root=root)) + select_to_index = {'train': 0, 'test': 1, 'valid': 2} + extracted_files = [download_from_url(URLS['PennTreebank'][select_to_index[key]], + root=root) for key in data_select] else: dataset_tar = download_from_url(URLS[dataset_name], root=root) extracted_files = [os.path.join(root, d) for d in extract_archive(dataset_tar)] From 1f60293a4d1366c8effd51728775d3aa32d23a6d Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 13:20:32 -0800 Subject: [PATCH 34/47] check subset. --- torchtext/datasets/language_modeling.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index f6ffd0f4f0..694417fafe 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -77,9 +77,8 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if isinstance(data_select, str): data_select = [data_select] - for item in data_select: - if item not in ('train', 'test', 'valid'): - raise TypeError('{} in data_select is not supported!'.format(item)) + if not set(data_select).issubset(set(('train', 'test', 'valid'))): + raise TypeError('data_select is not supported!') if dataset_name == 'PennTreebank': extracted_files = [] From 8bb1cb2f21bb5ea725d532856e3556d8e58c3f9e Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Fri, 22 Nov 2019 13:25:14 -0800 Subject: [PATCH 35/47] Error if dataset is empty. --- torchtext/datasets/language_modeling.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index 694417fafe..a61c9a493d 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -112,8 +112,12 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), for tokens in _iter: data[item] += [token_id for token_id in tokens] + for key in data_select: + if data[key] == []: + raise TypeError('Dataset {} is empty!'.format(key)) + return tuple(LanguageModelingDataset(torch.tensor(data[d]).long(), vocab) - for d in data_select if data[d] != []) + for d in data_select) def WikiText2(*args, **kwargs): From 6a50f2a7cc0b166cd280b6087d188845c441467b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 07:27:02 -0800 Subject: [PATCH 36/47] filter output is iterable. 
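The change below leans on the fact that filter() already returns a lazy iterator, so the intermediate list is unnecessary. A tiny demonstration with a toy vocab:

vocab = {'as': 2, 'pieces': 3, '<unk>': 0}
tokens = ['as', '<unk>', 'pieces']
removed_tokens = ['<unk>']

# filter() is lazy; the generator wrapped around it stays lazy as well.
ids = iter(vocab[token] for token in
           filter(lambda x: x not in removed_tokens, tokens))
print(list(ids))  # [2, 3]
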
--- torchtext/data/functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 4abce321b5..3795a57cb6 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -201,5 +201,5 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): if removed_tokens is None: yield iter(vocab[token] for token in tokens) else: - tokens = list(filter(lambda x: x not in removed_tokens, tokens)) - yield iter(vocab[token] for token in tokens) + yield iter(vocab[token] for token in + filter(lambda x: x not in removed_tokens, tokens)) From a29f4bd526b8e443a5a29be2ab7254afd328b74b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 07:55:02 -0800 Subject: [PATCH 37/47] flake8 --- torchtext/data/functional.py | 1 - 1 file changed, 1 deletion(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 3795a57cb6..66f70348e9 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -1,7 +1,6 @@ import sentencepiece as spm import re from torchtext.utils import unicode_csv_reader -import logging import io __all__ = [ From 9206e63d270bb5cae040904281c23d2c29e2ad0a Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 11:18:24 -0800 Subject: [PATCH 38/47] Add a claimer in README.rst --- README.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/README.rst b/README.rst index 6169e26a04..430b2e054a 100644 --- a/README.rst +++ b/README.rst @@ -129,6 +129,16 @@ Others are planned or a work in progress: See the ``test`` directory for examples of dataset usage. +Legacy Code +=========== + +We are currently retiring several datasets as legacy code ```torchtext.legacy```: + +* Sentiment analysis: IMDb +* Language modeling: abstract class + WikiText-2, WikiText103, PennTreebank + +These datasets are re-written with a new pattern that is introduced in `Release v0.5.0 `_. + Disclaimer on Datasets ====================== From e2ba8bf72fc0548bc4ec8c28113748cc0da03d14 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 11:45:43 -0800 Subject: [PATCH 39/47] revise create_data_from_iterator --- torchtext/data/functional.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 66f70348e9..3b30e33067 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -195,10 +195,9 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): >>> [0, 2, 3] >>> [2, 3] """ - + print("new iterator data") for tokens in iterator: if removed_tokens is None: yield iter(vocab[token] for token in tokens) else: - yield iter(vocab[token] for token in - filter(lambda x: x not in removed_tokens, tokens)) + yield iter(map(vocab, filter(lambda x: x not in removed_tokens, tokens))) From 099354091eccbfb10fe1351e13de9cfac72b409b Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 11:49:12 -0800 Subject: [PATCH 40/47] Remove a printout. 
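Besides dropping the stray print, the diff below replaces map(vocab, ...) with map(lambda x: vocab[x], ...): the lambda only needs item lookup, which is how the vocab is used everywhere else in this file. A toy comparison with a dict standing in for the vocab:

vocab = {'as': 2, 'pieces': 3}   # dict standing in for a Vocab object
tokens = ['as', '<unk>', 'pieces']
removed = ['<unk>']

ids = map(lambda x: vocab[x],
          filter(lambda x: x not in removed, tokens))
print(list(ids))  # [2, 3]
# map(vocab, ...) would require the vocab itself to be callable; item lookup,
# as used above, is the only thing the rest of the pipeline assumes.
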
--- torchtext/data/functional.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 3b30e33067..34584ffb30 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -195,9 +195,9 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None): >>> [0, 2, 3] >>> [2, 3] """ - print("new iterator data") for tokens in iterator: if removed_tokens is None: yield iter(vocab[token] for token in tokens) else: - yield iter(map(vocab, filter(lambda x: x not in removed_tokens, tokens))) + yield iter(map(lambda x: vocab[x], + filter(lambda x: x not in removed_tokens, tokens))) From 81055a077d51b4cd06bb2c66e3f7566013008322 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 13:15:47 -0800 Subject: [PATCH 41/47] Remove version num in legacy. --- torchtext/legacy/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/torchtext/legacy/__init__.py b/torchtext/legacy/__init__.py index ab3e4d382f..d7fa116bab 100644 --- a/torchtext/legacy/__init__.py +++ b/torchtext/legacy/__init__.py @@ -1,5 +1,3 @@ from . import datasets -__version__ = '0.4.0' - __all__ = ['datasets'] From 9dc475246d84ef210837512fd7af7713b9575f32 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 16:08:40 -0800 Subject: [PATCH 42/47] remove read_text_iterator func --- torchtext/data/__init__.py | 4 ++-- torchtext/data/functional.py | 21 --------------------- torchtext/datasets/language_modeling.py | 11 ++++++++--- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/torchtext/data/__init__.py b/torchtext/data/__init__.py index 5775326a13..331dfc679f 100644 --- a/torchtext/data/__init__.py +++ b/torchtext/data/__init__.py @@ -11,7 +11,7 @@ load_sp_model, \ sentencepiece_numericalizer, \ sentencepiece_tokenizer, custom_replace, simple_space_split, \ - read_text_iterator, create_data_from_iterator + create_data_from_iterator __all__ = ["Batch", "Dataset", "TabularDataset", @@ -26,4 +26,4 @@ "generate_sp_model", "load_sp_model", "sentencepiece_numericalizer", "sentencepiece_tokenizer", "custom_replace", "simple_space_split", - "read_text_iterator", "create_data_from_iterator"] + "create_data_from_iterator"] diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py index 34584ffb30..7dadcd18e7 100644 --- a/torchtext/data/functional.py +++ b/torchtext/data/functional.py @@ -154,27 +154,6 @@ def simple_space_split(iterator): yield line.split() -def read_text_iterator(path, tokenizer): - r"""Read text from path and yield a list of tokens based on the tokenizer - - Arguments: - path: the file path. - tokenizer: the tokenizer used to tokenize string text. - - Examples: - >>> from torchtext.data.functional import read_text_iterator - >>> tokenizer = get_tokenizer("basic_english") - >>> list((read_text_iterator('.data/ptb.train.txt', tokenizer))) - [['Sentencepiece', 'encode', 'as', 'pieces'], ['example', 'to', 'try!']] - """ - - with io.open(path, encoding="utf8") as f: - reader = unicode_csv_reader(f) - for row in reader: - tokens = tokenizer(' '.join(row)) - yield tokens - - def create_data_from_iterator(vocab, iterator, removed_tokens=None): r"""Yield a list of ids from an token iterator with a vocab. 
diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py index a61c9a493d..d0fd1d082d 100644 --- a/torchtext/datasets/language_modeling.py +++ b/torchtext/datasets/language_modeling.py @@ -1,11 +1,12 @@ import torch import logging import os +import io from torchtext.utils import download_from_url, extract_archive from torchtext.vocab import build_vocab_from_iterator from torchtext.data.utils import get_tokenizer from torchtext.vocab import Vocab -from torchtext.data.functional import read_text_iterator, create_data_from_iterator +from torchtext.data.functional import create_data_from_iterator URLS = { 'WikiText2': @@ -97,7 +98,9 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), if 'train' not in _path.keys(): raise TypeError("Must pass a vocab if train is not selected.") logging.info('Building Vocab based on {}'.format(_path['train'])) - vocab = build_vocab_from_iterator(read_text_iterator(_path['train'], tokenizer)) + txt_iter = iter(tokenizer(row) for row in io.open(_path['train'], + encoding="utf8")) + vocab = build_vocab_from_iterator(txt_iter) logging.info('Vocab has {} entries'.format(len(vocab))) else: if not isinstance(vocab, Vocab): @@ -107,8 +110,10 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"), for item in _path.keys(): data[item] = [] logging.info('Creating {} data'.format(item)) + txt_iter = iter(tokenizer(row) for row in io.open(_path[item], + encoding="utf8")) _iter = create_data_from_iterator( - vocab, read_text_iterator(_path[item], tokenizer), removed_tokens) + vocab, txt_iter, removed_tokens) for tokens in _iter: data[item] += [token_id for token_id in tokens] From 367a340ef05e6b4b7cf42eb61c89380cb40ec501 Mon Sep 17 00:00:00 2001 From: Guanheng Zhang Date: Mon, 25 Nov 2019 16:10:11 -0800 Subject: [PATCH 43/47] Update README. --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 430b2e054a..df6dc9a29c 100644 --- a/README.rst +++ b/README.rst @@ -132,7 +132,7 @@ See the ``test`` directory for examples of dataset usage. 
 Legacy Code
 ===========
 
-We are currently retiring several datasets as legacy code ```torchtext.legacy```:
+We have retired several datasets and moved them under ```torchtext.legacy```:
 
 * Sentiment analysis: IMDb
 * Language modeling: abstract class + WikiText-2, WikiText103, PennTreebank

From b54b883c218efc9dcbb657eba2f8d5dce1f4c73a Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Mon, 25 Nov 2019 16:18:33 -0800
Subject: [PATCH 44/47] Update the test case after removing read_text_iterator

---
 test/data/test_builtin_datasets.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index 7b67f68652..989869cb2b 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -37,9 +37,9 @@ def test_wikitext2(self):
         from torchtext.datasets import WikiText2
         # smoke test to ensure wikitext2 works properly
         train_dataset, test_dataset, valid_dataset = WikiText2()
-        self.assertEqual(len(train_dataset), 1947375)
-        self.assertEqual(len(test_dataset), 230357)
-        self.assertEqual(len(valid_dataset), 203947)
+        self.assertEqual(len(train_dataset), 2049990)
+        self.assertEqual(len(test_dataset), 241859)
+        self.assertEqual(len(valid_dataset), 214417)
 
         vocab = train_dataset.get_vocab()
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]

From 1478d13c71b4cbbe2568e83740c82378872e5486 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Mon, 25 Nov 2019 16:35:17 -0800
Subject: [PATCH 45/47] rename to numericalize_tokens_from_iterator

---
 torchtext/data/__init__.py              | 4 ++--
 torchtext/data/functional.py            | 6 +++---
 torchtext/datasets/language_modeling.py | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/torchtext/data/__init__.py b/torchtext/data/__init__.py
index 331dfc679f..20ca10a7a3 100644
--- a/torchtext/data/__init__.py
+++ b/torchtext/data/__init__.py
@@ -11,7 +11,7 @@
     load_sp_model, \
     sentencepiece_numericalizer, \
     sentencepiece_tokenizer, custom_replace, simple_space_split, \
-    create_data_from_iterator
+    numericalize_tokens_from_iterator
 
 __all__ = ["Batch", "Dataset", "TabularDataset",
@@ -26,4 +26,4 @@
            "generate_sp_model", "load_sp_model",
           "sentencepiece_numericalizer", "sentencepiece_tokenizer",
           "custom_replace", "simple_space_split",
-           "create_data_from_iterator"]
+           "numericalize_tokens_from_iterator"]
diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py
index 7dadcd18e7..b18722a13e 100644
--- a/torchtext/data/functional.py
+++ b/torchtext/data/functional.py
@@ -154,7 +154,7 @@ def simple_space_split(iterator):
         yield line.split()
 
 
-def create_data_from_iterator(vocab, iterator, removed_tokens=None):
+def numericalize_tokens_from_iterator(vocab, iterator, removed_tokens=None):
     r"""Yield a list of ids from an token iterator with a vocab.
 
     Arguments:
@@ -164,9 +164,9 @@ def create_data_from_iterator(vocab, iterator, removed_tokens=None):
 
     Examples:
         >>> from torchtext.data.functional import simple_space_split
-        >>> from torchtext.data.functional import create_data_from_iterator
+        >>> from torchtext.data.functional import numericalize_tokens_from_iterator
         >>> vocab = {'Sentencepiece' : 0, 'encode' : 1, 'as' : 2, 'pieces' : 3}
-        >>> ids_iter = create_data_from_iterator(vocab,
+        >>> ids_iter = numericalize_tokens_from_iterator(vocab,
         >>>                             simple_space_split(["Sentencepiece as pieces",
         >>>                                                 "as pieces"]))
         >>> for ids in ids_iter:
diff --git a/torchtext/datasets/language_modeling.py b/torchtext/datasets/language_modeling.py
index d0fd1d082d..04c7f5c00d 100644
--- a/torchtext/datasets/language_modeling.py
+++ b/torchtext/datasets/language_modeling.py
@@ -6,7 +6,7 @@
 from torchtext.vocab import build_vocab_from_iterator
 from torchtext.data.utils import get_tokenizer
 from torchtext.vocab import Vocab
-from torchtext.data.functional import create_data_from_iterator
+from torchtext.data.functional import numericalize_tokens_from_iterator
 
 URLS = {
     'WikiText2':
@@ -112,7 +112,7 @@ def _setup_datasets(dataset_name, tokenizer=get_tokenizer("basic_english"),
         logging.info('Creating {} data'.format(item))
         txt_iter = iter(tokenizer(row) for row in io.open(_path[item],
                                                           encoding="utf8"))
-        _iter = create_data_from_iterator(
+        _iter = numericalize_tokens_from_iterator(
             vocab, txt_iter, removed_tokens)
         for tokens in _iter:
             data[item] += [token_id for token_id in tokens]

From cf7c1887ac9b6e62bd6d0bf97ea35cd35075cb36 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Mon, 25 Nov 2019 16:50:32 -0800
Subject: [PATCH 46/47] flake8

---
 test/data/test_builtin_datasets.py | 2 +-
 torchtext/data/functional.py       | 2 --
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index 989869cb2b..a806714299 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -43,7 +43,7 @@ def test_wikitext2(self):
 
         vocab = train_dataset.get_vocab()
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
-        self.assertEqual(tokens_ids, [2, 285, 502, 699])
+        self.assertEqual(tokens_ids, [[2, 286, 503, 700]])
 
         # Delete the dataset after we're done to save disk space on CI
         datafile = os.path.join(self.project_root, ".data", "wikitext-2")
diff --git a/torchtext/data/functional.py b/torchtext/data/functional.py
index b18722a13e..3aad8aa7fc 100644
--- a/torchtext/data/functional.py
+++ b/torchtext/data/functional.py
@@ -1,7 +1,5 @@
 import sentencepiece as spm
 import re
-from torchtext.utils import unicode_csv_reader
-import io
 
 __all__ = [
     "generate_sp_model", "load_sp_model",

From 03dfc278095a2afe7c69fb79a689f34ba85e5c02 Mon Sep 17 00:00:00 2001
From: Guanheng Zhang
Date: Mon, 25 Nov 2019 17:46:11 -0800
Subject: [PATCH 47/47] minor

---
 test/data/test_builtin_datasets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index a806714299..3e1102ef44 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -43,7 +43,7 @@ def test_wikitext2(self):
 
         vocab = train_dataset.get_vocab()
         tokens_ids = [vocab[token] for token in 'the player characters rest'.split()]
-        self.assertEqual(tokens_ids, [[2, 286, 503, 700]])
+        self.assertEqual(tokens_ids, [2, 286, 503, 700])
 
         # Delete the dataset after we're done to save disk space on CI
         datafile = os.path.join(self.project_root, ".data", "wikitext-2")
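For reference, a minimal sketch (not part of the patch series) of how the pieces left in place by PATCH 42 and PATCH 45 fit together: an inline tokenizing generator replaces the removed read_text_iterator, build_vocab_from_iterator consumes it, and numericalize_tokens_from_iterator turns a fresh token iterator into ids, mirroring _setup_datasets in torchtext/datasets/language_modeling.py. The file name 'wiki.train.tokens' and the '<unk>' filter are illustrative assumptions, not values taken from these patches:

    import io

    from torchtext.data.functional import numericalize_tokens_from_iterator
    from torchtext.data.utils import get_tokenizer
    from torchtext.vocab import build_vocab_from_iterator

    tokenizer = get_tokenizer("basic_english")

    # Tokenize the raw text file line by line; this inline generator is the
    # replacement for the removed read_text_iterator helper.
    # 'wiki.train.tokens' is an assumed example path.
    tokens_iter = iter(tokenizer(row) for row in io.open('wiki.train.tokens',
                                                         encoding="utf8"))
    vocab = build_vocab_from_iterator(tokens_iter)

    # The generator above is exhausted while building the vocab, so create a
    # fresh one before numericalizing.
    tokens_iter = iter(tokenizer(row) for row in io.open('wiki.train.tokens',
                                                         encoding="utf8"))
    ids = []
    for id_iter in numericalize_tokens_from_iterator(vocab, tokens_iter,
                                                     removed_tokens=['<unk>']):
        # Each yielded item is an iterator of token ids for one line of text.
        ids.extend(token_id for token_id in id_iter)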