diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py
index d7a02995d7..95f2894634 100644
--- a/test/data/test_builtin_datasets.py
+++ b/test/data/test_builtin_datasets.py
@@ -138,11 +138,18 @@ def test_text_classification(self):
         self._helper_test_func(len(test_iter), 7600, next(iter(test_iter))[1][:25], 'Fears for T N pension aft')
         del train_iter, test_iter
 
-    def test_num_lines_of_setup_iter_dataset(self):
-        train_iter, test_iter = torchtext.experimental.datasets.raw.AG_NEWS()
-        train_iter.setup_iter(start=10, num_lines=100)
+    def test_num_lines_of_dataset(self):
+        train_iter, test_iter = torchtext.experimental.datasets.raw.AG_NEWS(offset=10)
         _data = [item for item in train_iter]
-        self.assertEqual(len(_data), 100)
+        self.assertEqual(len(_data), 119990)
+
+    def test_offset_dataset(self):
+        train_iter, test_iter = torchtext.experimental.datasets.raw.AG_NEWS(split=('train', 'test'),
+                                                                             offset=10)
+        container = [text[:20] for idx, (label, text) in enumerate(train_iter) if idx < 5]
+        self.assertEqual(container, ['Oil and Economy Clou', 'No Need for OPEC to ',
+                                     'Non-OPEC Nations Sho', 'Google IPO Auction O',
+                                     'Dollar Falls Broadly'])
 
     def test_imdb(self):
         from torchtext.experimental.datasets import IMDB
diff --git a/torchtext/experimental/datasets/raw/common.py b/torchtext/experimental/datasets/raw/common.py
index 06415830c3..1267b703ff 100644
--- a/torchtext/experimental/datasets/raw/common.py
+++ b/torchtext/experimental/datasets/raw/common.py
@@ -14,29 +14,17 @@ class RawTextIterableDataset(torch.utils.data.IterableDataset):
     """Defines an abstraction for raw text iterable datasets.
     """
 
-    def __init__(self, name, full_num_lines, iterator):
+    def __init__(self, name, full_num_lines, iterator, offset=0):
         """Initiate text-classification dataset.
""" super(RawTextIterableDataset, self).__init__() self.name = name self.full_num_lines = full_num_lines self._iterator = iterator - self.has_setup = False - self.start = 0 - self.num_lines = None - - def setup_iter(self, start=0, num_lines=None): - self.start = start - self.num_lines = num_lines - if num_lines and self.start + self.num_lines > self.full_num_lines: - raise ValueError("Requested start {} and num_lines {} exceeds available number of lines {}".format( - self.start, self.num_lines, self.full_num_lines)) - self.has_setup = True + self.start = offset + self.num_lines = full_num_lines - offset def __iter__(self): - if not self.has_setup: - self.setup_iter() - for i, item in enumerate(self._iterator): if i < self.start: continue @@ -45,9 +33,7 @@ def __iter__(self): yield item def __len__(self): - if self.has_setup: - return self.num_lines - return self.full_num_lines + return self.num_lines def get_iterator(self): return self._iterator diff --git a/torchtext/experimental/datasets/raw/language_modeling.py b/torchtext/experimental/datasets/raw/language_modeling.py index 9edb6e735c..c9bf90f216 100644 --- a/torchtext/experimental/datasets/raw/language_modeling.py +++ b/torchtext/experimental/datasets/raw/language_modeling.py @@ -17,7 +17,7 @@ } -def _setup_datasets(dataset_name, root, split, year, language): +def _setup_datasets(dataset_name, root, split, year, language, offset): split = check_default_set(split, ('train', 'test', 'valid')) if isinstance(split, str): split = [split] @@ -55,10 +55,10 @@ def _setup_datasets(dataset_name, root, split, year, language): data[item] = iter(io.open(_path[item], encoding="utf8")) return tuple(RawTextIterableDataset(dataset_name, - NUM_LINES[dataset_name][item], data[item]) for item in split) + NUM_LINES[dataset_name][item], data[item], offset=offset) for item in split) -def WikiText2(root='.data', split=('train', 'valid', 'test')): +def WikiText2(root='.data', split=('train', 'valid', 'test'), offset=0): """ Defines WikiText2 datasets. Create language modeling dataset: WikiText2 @@ -72,6 +72,7 @@ def WikiText2(root='.data', split=('train', 'valid', 'test')): just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. + offset: the number of the starting line. Default: 0 Examples: >>> from torchtext.experimental.raw.datasets import WikiText2 @@ -80,10 +81,10 @@ def WikiText2(root='.data', split=('train', 'valid', 'test')): """ - return _setup_datasets("WikiText2", root, split, None, None) + return _setup_datasets("WikiText2", root, split, None, None, offset) -def WikiText103(root='.data', split=('train', 'valid', 'test')): +def WikiText103(root='.data', split=('train', 'valid', 'test'), offset=0): """ Defines WikiText103 datasets. Create language modeling dataset: WikiText103 @@ -96,6 +97,7 @@ def WikiText103(root='.data', split=('train', 'valid', 'test')): could also choose any one or two of them, for example ('train', 'test'). If 'train' is not in the tuple, an vocab object should be provided which will be used to process valid and/or test data. + offset: the number of the starting line. 
Default: 0 Examples: >>> from torchtext.experimental.datasets.raw import WikiText103 @@ -103,10 +105,10 @@ def WikiText103(root='.data', split=('train', 'valid', 'test')): >>> valid_dataset, = WikiText103(split='valid') """ - return _setup_datasets("WikiText103", root, split, None, None) + return _setup_datasets("WikiText103", root, split, None, None, offset) -def PennTreebank(root='.data', split=('train', 'valid', 'test')): +def PennTreebank(root='.data', split=('train', 'valid', 'test'), offset=0): """ Defines PennTreebank datasets. Create language modeling dataset: PennTreebank @@ -121,6 +123,7 @@ def PennTreebank(root='.data', split=('train', 'valid', 'test')): just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. + offset: the number of the starting line. Default: 0 Examples: >>> from torchtext.experimental.datasets.raw import PennTreebank @@ -129,10 +132,10 @@ def PennTreebank(root='.data', split=('train', 'valid', 'test')): """ - return _setup_datasets("PennTreebank", root, split, None, None) + return _setup_datasets("PennTreebank", root, split, None, None, offset) -def WMTNewsCrawl(root='.data', split=('train'), year=2010, language='en'): +def WMTNewsCrawl(root='.data', split=('train'), year=2010, language='en', offset=0): """ Defines WMT News Crawl. Create language modeling dataset: WMTNewsCrawl @@ -143,11 +146,12 @@ def WMTNewsCrawl(root='.data', split=('train'), year=2010, language='en'): (Default: 'train') year: the year of the dataset (Default: 2010) language: the language of the dataset (Default: 'en') + offset: the number of the starting line. Default: 0 Note: WMTNewsCrawl provides datasets based on the year and language instead of train/valid/test. """ - return _setup_datasets("WMTNewsCrawl", root, split, year, language) + return _setup_datasets("WMTNewsCrawl", root, split, year, language, offset) DATASETS = { diff --git a/torchtext/experimental/datasets/raw/question_answer.py b/torchtext/experimental/datasets/raw/question_answer.py index 3bc4d83461..937e58af24 100644 --- a/torchtext/experimental/datasets/raw/question_answer.py +++ b/torchtext/experimental/datasets/raw/question_answer.py @@ -29,15 +29,15 @@ def _create_data_from_json(data_path): yield (_context, _question, _answers, _answer_start) -def _setup_datasets(dataset_name, root, split): +def _setup_datasets(dataset_name, root, split, offset): split = check_default_set(split, ('train', 'dev')) extracted_files = {key: download_from_url(URLS[dataset_name][key], root=root, hash_value=MD5[dataset_name][key], hash_type='md5') for key in split} return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item], - _create_data_from_json(extracted_files[item])) for item in split) + _create_data_from_json(extracted_files[item]), offset=offset) for item in split) -def SQuAD1(root='.data', split=('train', 'dev')): +def SQuAD1(root='.data', split=('train', 'dev'), offset=0): """ A dataset iterator yields the data of Stanford Question Answering dataset - SQuAD1.0. The iterator yields a tuple of (raw context, raw question, a list of raw answer, a list of answer positions in the raw context). @@ -51,6 +51,7 @@ def SQuAD1(root='.data', split=('train', 'dev')): split: a string or tuple for the returned datasets (Default: ('train', 'dev')) By default, both datasets (train, dev) are generated. Users could also choose any one or two of them, for example ('train', 'dev') or just a string 'train'. 
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train_dataset, dev_dataset = torchtext.experimental.datasets.raw.SQuAD1()
@@ -58,10 +59,10 @@
         >>> print(idx, (context, question, answer, ans_pos))
 
     """
-    return _setup_datasets("SQuAD1", root, split)
+    return _setup_datasets("SQuAD1", root, split, offset)
 
 
-def SQuAD2(root='.data', split=('train', 'dev')):
+def SQuAD2(root='.data', split=('train', 'dev'), offset=0):
     """ A dataset iterator yields the data of Stanford Question Answering dataset - SQuAD2.0.
     The iterator yields a tuple of (raw context, raw question, a list of raw answer,
     a list of answer positions in the raw context).
@@ -75,6 +76,7 @@ def SQuAD2(root='.data', split=('train', 'dev')):
         split: a string or tuple for the returned datasets (Default: ('train', 'dev'))
             By default, both datasets (train, dev) are generated. Users could also choose any one or two of them,
             for example ('train', 'dev') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train_dataset, dev_dataset = torchtext.experimental.datasets.raw.SQuAD2()
@@ -82,7 +84,7 @@
         >>> print(idx, (context, question, answer, ans_pos))
 
     """
-    return _setup_datasets("SQuAD2", root, split)
+    return _setup_datasets("SQuAD2", root, split, offset)
 
 
 DATASETS = {
diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py
index b3f221a385..2c59946d69 100644
--- a/torchtext/experimental/datasets/raw/sequence_tagging.py
+++ b/torchtext/experimental/datasets/raw/sequence_tagging.py
@@ -39,7 +39,7 @@ def _construct_filepath(paths, file_suffix):
     return None
 
 
-def _setup_datasets(dataset_name, separator, root, split):
+def _setup_datasets(dataset_name, separator, root, split, offset):
     split = check_default_set(split, target_select=('train', 'valid', 'test'))
     extracted_files = []
     if isinstance(URLS[dataset_name], dict):
@@ -60,11 +60,11 @@ def _setup_datasets(dataset_name, separator, root, split):
         "test": _construct_filepath(extracted_files, "test.txt")
     }
     return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
-                                        _create_data_from_iob(data_filenames[item], separator))
+                                        _create_data_from_iob(data_filenames[item], separator), offset=offset)
                  if data_filenames[item] is not None else None for item in split)
 
 
-def UDPOS(root=".data", split=('train', 'valid', 'test')):
+def UDPOS(root=".data", split=('train', 'valid', 'test'), offset=0):
     """ Universal Dependencies English Web Treebank
 
     Separately returns the training and test dataset
@@ -75,15 +75,16 @@ def UDPOS(root=".data", split=('train', 'valid', 'test')):
            By default, all the datasets (train, valid, test) are generated. Users
            could also choose any one or two of them, for example ('train', 'valid', 'test')
            or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> from torchtext.experimental.datasets.raw import UDPOS
         >>> train_dataset, valid_dataset, test_dataset = UDPOS()
     """
-    return _setup_datasets("UDPOS", "\t", root, split)
+    return _setup_datasets("UDPOS", "\t", root, split, offset)
 
 
-def CoNLL2000Chunking(root=".data", split=('train', 'test')):
+def CoNLL2000Chunking(root=".data", split=('train', 'test'), offset=0):
     """ CoNLL 2000 Chunking Dataset
 
     Separately returns the training and test dataset
@@ -93,12 +94,13 @@ def CoNLL2000Chunking(root=".data", split=('train', 'test')):
         split: a string or tuple for the returned datasets (Default: ('train', 'test'))
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> from torchtext.experimental.datasets.raw import CoNLL2000Chunking
         >>> train_dataset, test_dataset = CoNLL2000Chunking()
     """
-    return _setup_datasets("CoNLL2000Chunking", " ", root, split)
+    return _setup_datasets("CoNLL2000Chunking", " ", root, split, offset)
 
 
 DATASETS = {
diff --git a/torchtext/experimental/datasets/raw/text_classification.py b/torchtext/experimental/datasets/raw/text_classification.py
index 4a1706714c..a4fa4dfa19 100644
--- a/torchtext/experimental/datasets/raw/text_classification.py
+++ b/torchtext/experimental/datasets/raw/text_classification.py
@@ -33,7 +33,7 @@ def _create_data_from_csv(data_path):
             yield int(row[0]), ' '.join(row[1:])
 
 
-def _setup_datasets(dataset_name, root, split):
+def _setup_datasets(dataset_name, root, split, offset):
     split = check_default_set(split, target_select=('train', 'test'))
     if dataset_name == 'AG_NEWS':
         extracted_files = [download_from_url(URLS[dataset_name][item], root=root,
@@ -51,10 +51,10 @@ def _setup_datasets(dataset_name, root, split):
         if fname.endswith('test.csv'):
             cvs_path['test'] = fname
     return tuple(RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][item],
-                                        _create_data_from_csv(cvs_path[item])) for item in split)
+                                        _create_data_from_csv(cvs_path[item]), offset=offset) for item in split)
 
 
-def AG_NEWS(root='.data', split=('train', 'test')):
+def AG_NEWS(root='.data', split=('train', 'test'), offset=0):
     """ Defines AG_NEWS datasets.
 
     Create supervised learning dataset: AG_NEWS
@@ -66,15 +66,16 @@ def AG_NEWS(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.AG_NEWS()
     """
 
-    return _setup_datasets("AG_NEWS", root, split)
+    return _setup_datasets("AG_NEWS", root, split, offset)
 
 
-def SogouNews(root='.data', split=('train', 'test')):
+def SogouNews(root='.data', split=('train', 'test'), offset=0):
     """ Defines SogouNews datasets.
 
     Create supervised learning dataset: SogouNews
@@ -86,15 +87,16 @@ def SogouNews(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.SogouNews()
     """
 
-    return _setup_datasets("SogouNews", root, split)
+    return _setup_datasets("SogouNews", root, split, offset)
 
 
-def DBpedia(root='.data', split=('train', 'test')):
+def DBpedia(root='.data', split=('train', 'test'), offset=0):
     """ Defines DBpedia datasets.
 
     Create supervised learning dataset: DBpedia
@@ -106,15 +108,16 @@ def DBpedia(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.DBpedia()
     """
 
-    return _setup_datasets("DBpedia", root, split)
+    return _setup_datasets("DBpedia", root, split, offset)
 
 
-def YelpReviewPolarity(root='.data', split=('train', 'test')):
+def YelpReviewPolarity(root='.data', split=('train', 'test'), offset=0):
     """ Defines YelpReviewPolarity datasets.
 
     Create supervised learning dataset: YelpReviewPolarity
@@ -126,15 +129,16 @@ def YelpReviewPolarity(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.YelpReviewPolarity()
     """
 
-    return _setup_datasets("YelpReviewPolarity", root, split)
+    return _setup_datasets("YelpReviewPolarity", root, split, offset)
 
 
-def YelpReviewFull(root='.data', split=('train', 'test')):
+def YelpReviewFull(root='.data', split=('train', 'test'), offset=0):
     """ Defines YelpReviewFull datasets.
 
     Create supervised learning dataset: YelpReviewFull
@@ -146,15 +150,16 @@ def YelpReviewFull(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.YelpReviewFull()
     """
 
-    return _setup_datasets("YelpReviewFull", root, split)
+    return _setup_datasets("YelpReviewFull", root, split, offset)
 
 
-def YahooAnswers(root='.data', split=('train', 'test')):
+def YahooAnswers(root='.data', split=('train', 'test'), offset=0):
     """ Defines YahooAnswers datasets.
 
     Create supervised learning dataset: YahooAnswers
@@ -166,15 +171,16 @@ def YahooAnswers(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.YahooAnswers()
     """
 
-    return _setup_datasets("YahooAnswers", root, split)
+    return _setup_datasets("YahooAnswers", root, split, offset)
 
 
-def AmazonReviewPolarity(root='.data', split=('train', 'test')):
+def AmazonReviewPolarity(root='.data', split=('train', 'test'), offset=0):
     """ Defines AmazonReviewPolarity datasets.
 
     Create supervised learning dataset: AmazonReviewPolarity
@@ -186,15 +192,16 @@ def AmazonReviewPolarity(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.AmazonReviewPolarity()
     """
 
-    return _setup_datasets("AmazonReviewPolarity", root, split)
+    return _setup_datasets("AmazonReviewPolarity", root, split, offset)
 
 
-def AmazonReviewFull(root='.data', split=('train', 'test')):
+def AmazonReviewFull(root='.data', split=('train', 'test'), offset=0):
     """ Defines AmazonReviewFull datasets.
 
     Create supervised learning dataset: AmazonReviewFull
@@ -206,12 +213,13 @@ def AmazonReviewFull(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.AmazonReviewFull()
     """
 
-    return _setup_datasets("AmazonReviewFull", root, split)
+    return _setup_datasets("AmazonReviewFull", root, split, offset)
 
 
 def generate_imdb_data(key, extracted_files):
@@ -224,7 +232,7 @@ def generate_imdb_data(key, extracted_files):
                 yield label, f.read()
 
 
-def IMDB(root='.data', split=('train', 'test')):
+def IMDB(root='.data', split=('train', 'test'), offset=0):
     """ Defines raw IMDB datasets.
 
     Create supervised learning dataset: IMDB
@@ -236,6 +244,7 @@ def IMDB(root='.data', split=('train', 'test')):
         split: a string or tuple for the returned datasets. Default: ('train', 'test')
            By default, both datasets (train, test) are generated. Users could also choose any one or two of them,
            for example ('train', 'test') or just a string 'train'.
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> train, test = torchtext.experimental.datasets.raw.IMDB()
@@ -246,7 +255,7 @@ def IMDB(root='.data', split=('train', 'test')):
     extracted_files = extract_archive(dataset_tar)
     return tuple(RawTextIterableDataset("IMDB", NUM_LINES["IMDB"][item],
                                         generate_imdb_data(item,
-                                                           extracted_files)) for item in split)
+                                                           extracted_files), offset=offset) for item in split)
 
 
 DATASETS = {
diff --git a/torchtext/experimental/datasets/raw/translation.py b/torchtext/experimental/datasets/raw/translation.py
index d4ca29a028..9cc10706c3 100644
--- a/torchtext/experimental/datasets/raw/translation.py
+++ b/torchtext/experimental/datasets/raw/translation.py
@@ -116,7 +116,7 @@ def _construct_filepaths(paths, src_filename, tgt_filename):
 
 
 def _setup_datasets(dataset_name, train_filenames, valid_filenames, test_filenames,
-                    split, root):
+                    split, root, offset):
     split = check_default_set(split, ('train', 'valid', 'test'))
     if not isinstance(train_filenames, tuple) and not isinstance(valid_filenames, tuple) \
             and not isinstance(test_filenames, tuple):
@@ -184,7 +184,7 @@ def _iter(src_data_iter, tgt_data_iter):
                 yield item
 
         datasets.append(
-            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][key], _iter(src_data_iter, tgt_data_iter)))
+            RawTextIterableDataset(dataset_name, NUM_LINES[dataset_name][key], _iter(src_data_iter, tgt_data_iter), offset=offset))
 
     return tuple(datasets)
 
@@ -192,7 +192,7 @@ def _iter(src_data_iter, tgt_data_iter):
 def Multi30k(train_filenames=("train.de", "train.en"),
              valid_filenames=("val.de", "val.en"),
              test_filenames=("test_2016_flickr.de", "test_2016_flickr.en"),
-             split=('train', 'valid', 'test'), root='.data'):
+             split=('train', 'valid', 'test'), root='.data', offset=0):
     """ Define translation datasets: Multi30k
     Separately returns train/valid/test datasets as a tuple
     The available dataset include:
@@ -259,12 +259,13 @@ def Multi30k(train_filenames=("train.de", "train.en"),
            just a string 'train'. If 'train' is not in the tuple or string, a vocab object
            should be provided which will be used to process valid and/or test data.
         root: Directory where the datasets are saved. Default: ".data"
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> from torchtext.experimental.datasets.raw import Multi30k
         >>> train_dataset, valid_dataset, test_dataset = Multi30k()
     """
-    return _setup_datasets("Multi30k", train_filenames, valid_filenames, test_filenames, split, root)
+    return _setup_datasets("Multi30k", train_filenames, valid_filenames, test_filenames, split, root, offset)
 
 
 def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'),
           valid_filenames=('IWSLT16.TED.tst2013.de-en.de',
@@ -272,7 +273,7 @@
                            'IWSLT16.TED.tst2013.de-en.en'),
           test_filenames=('IWSLT16.TED.tst2014.de-en.de',
                           'IWSLT16.TED.tst2014.de-en.en'),
-          split=('train', 'valid', 'test'), root='.data'):
+          split=('train', 'valid', 'test'), root='.data', offset=0):
     """ Define translation datasets: IWSLT
     Separately returns train/valid/test datasets
     The available datasets include:
@@ -425,12 +426,13 @@ def IWSLT(train_filenames=('train.de-en.de', 'train.de-en.en'),
            just a string 'train'. If 'train' is not in the tuple or string, a vocab object
            should be provided which will be used to process valid and/or test data.
         root: Directory where the datasets are saved. Default: ".data"
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> from torchtext.experimental.datasets.raw import IWSLT
         >>> train_dataset, valid_dataset, test_dataset = IWSLT()
     """
-    return _setup_datasets("IWSLT", train_filenames, valid_filenames, test_filenames, split, root)
+    return _setup_datasets("IWSLT", train_filenames, valid_filenames, test_filenames, split, root, offset)
 
 
 def WMT14(train_filenames=('train.tok.clean.bpe.32000.de',
                            'train.tok.clean.bpe.32000.en'),
           valid_filenames=('newstest2013.tok.bpe.32000.de',
@@ -439,7 +441,7 @@
                            'newstest2013.tok.bpe.32000.en'),
           test_filenames=('newstest2014.tok.bpe.32000.de',
                           'newstest2014.tok.bpe.32000.en'),
-          split=('train', 'valid', 'test'), root='.data'):
+          split=('train', 'valid', 'test'), root='.data', offset=0):
     """ Define translation datasets: WMT14
     Separately returns train/valid/test datasets
     The available datasets include:
@@ -507,12 +509,13 @@ def WMT14(train_filenames=('train.tok.clean.bpe.32000.de',
            just a string 'train'. If 'train' is not in the tuple or string, a vocab object
            should be provided which will be used to process valid and/or test data.
         root: Directory where the datasets are saved. Default: ".data"
+        offset: the number of the starting line. Default: 0
 
     Examples:
         >>> from torchtext.experimental.datasets.raw import WMT14
         >>> train_dataset, valid_dataset, test_dataset = WMT14()
     """
-    return _setup_datasets("WMT14", train_filenames, valid_filenames, test_filenames, split, root)
+    return _setup_datasets("WMT14", train_filenames, valid_filenames, test_filenames, split, root, offset)
 
 
 DATASETS = {
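
Usage sketch (not part of the patch): a best-effort illustration, assuming the patch is applied, of how the new offset argument behaves. It skips the first offset lines of each returned split and shortens len() accordingly, mirroring test_num_lines_of_dataset and test_offset_dataset above; AG_NEWS is downloaded to '.data' on first use.

    # Hedged usage sketch of the new `offset` argument (values taken from the tests above).
    from torchtext.experimental.datasets.raw import AG_NEWS

    # Skip the first 10 lines of each split's underlying file.
    train_iter, test_iter = AG_NEWS(split=('train', 'test'), offset=10)

    print(len(train_iter))    # 119990, i.e. full_num_lines (120000) - offset (10)
    label, text = next(iter(train_iter))
    print(text[:20])          # 'Oil and Economy Clou' per test_offset_dataset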