From b068312443dc1873c903cd5b25ee6b02ee60c8d1 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 3 Jun 2020 11:51:29 +0700 Subject: [PATCH 01/24] add raw for sequence tagging --- .../datasets/raw/sequence_tagging.py | 137 ++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100644 torchtext/experimental/datasets/raw/sequence_tagging.py diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py new file mode 100644 index 0000000000..8155a1dff1 --- /dev/null +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -0,0 +1,137 @@ +import torch + +from torchtext.utils import download_from_url, extract_archive + +URLS = { + "UDPOS": + 'https://bitbucket.org/sivareddyg/public/downloads/en-ud-v2.zip', + "CoNLL2000Chunking": [ + 'https://www.clips.uantwerpen.be/conll2000/chunking/train.txt.gz', + 'https://www.clips.uantwerpen.be/conll2000/chunking/test.txt.gz' + ] +} + + +def _create_data_from_iob(data_path, separator="\t"): + with open(data_path, encoding="utf-8") as input_file: + columns = [] + for line in input_file: + line = line.strip() + if line == "": + if columns: + yield columns + columns = [] + else: + for i, column in enumerate(line.split(separator)): + if len(columns) < i + 1: + columns.append([]) + columns[i].append(column) + + +def _setup_datasets(dataset_name, + train_filename, + valid_filename, + test_filename, + separator, + root=".data"): + + extracted_files = [] + if isinstance(URLS[dataset_name], list): + for f in URLS[dataset_name]: + dataset_tar = download_from_url(f, root=root) + extracted_files.extend(extract_archive(dataset_tar)) + elif isinstance(URLS[dataset_name], str): + dataset_tar = download_from_url(URLS[dataset_name], root=root) + extracted_files.extend(extract_archive(dataset_tar)) + else: + raise ValueError( + "URLS for {} has to be in a form or list or string".format( + dataset_name)) + + data_filenames = dict() + for fname in extracted_files: + if train_filename and train_filename in fname: + data_filenames["train"] = fname + else: + data_filenames["train"] = None + + if valid_filename and valid_filename in fname: + data_filenames["valid"] = fname + else: + data_filenames["valid"] = None + + if test_filename and test_filename in fname: + data_filenames["test"] = fname + else: + data_filenames["test"] = None + + datasets = [] + for key in data_filenames.keys(): + if data_filenames[key] is not None: + datasets.append( + RawSequenceTaggingIterableDataset( + _create_data_from_iob(data_filenames[key], separator))) + + return datasets + + +class RawSequenceTaggingIterableDataset(torch.utils.data.IterableDataset): + """Defines an abstraction for raw text sequence tagging iterable datasets. 
+ """ + def __init__(self, iterator): + super(RawSequenceTaggingIterableDataset).__init__() + + self._iterator = iterator + self.has_setup = False + self.start = 0 + self.num_lines = None + + def setup_iter(self, start=0, num_lines=None): + self.start = start + self.num_lines = num_lines + self.has_setup = True + + def __iter__(self): + if not self.has_setup: + self.setup_iter() + + for i, item in enumerate(self._iterator): + if i >= self.start: + yield item + if (self.num_lines is not None) and (i == (self.start + + self.num_lines)): + break + + def get_iterator(self): + return self._iterator + + +def UDPOS(train_filename="en-ud-tag.v2.train.txt", + valid_filename="en-ud-tag.v2.dev.txt", + test_filename="en-ud-tag.v2.test.txt", + root=".data"): + """ Universal Dependencies English Web Treebank. + """ + return _setup_datasets(dataset_name="UDPOS", + root=root, + train_filename=train_filename, + valid_filename=valid_filename, + test_filename=test_filename, + separator="\t") + + +def CoNLL2000Chunking(train_filename="train.txt", + valid_filename=None, + test_filename="test.txt", + root=".data"): + """ CoNLL 2000 Chunking Dataset + """ + return _setup_datasets(dataset_name="CoNLL2000Chunking", + root=root, + train_filename=train_filename, + valid_filename=valid_filename, + test_filename=test_filename, + separator=' ') + + +DATASETS = {"UDPOS": UDPOS, "CoNLL2000Chunking": CoNLL2000Chunking} From 41b975f27210cf6e14c7c6aab13ebbe97008242a Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 3 Jun 2020 11:51:39 +0700 Subject: [PATCH 02/24] WIP sequence tagging dataset --- .../experimental/datasets/sequence_tagging.py | 113 ++++++++++++++++++ 1 file changed, 113 insertions(+) create mode 100644 torchtext/experimental/datasets/sequence_tagging.py diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py new file mode 100644 index 0000000000..d885992f0f --- /dev/null +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -0,0 +1,113 @@ +import torch + +from torchtext.experimental.datasets import raw +from torchtext.experimental.functional import ( + vocab_func, + totensor, + sequential_transforms, +) + +def _build_vocab(data): + + for line in data: + for col in line: + + + +def _setup_datasets( + dataset_name, + train_filename, + valid_filename, + test_filename, + separator, + data_select=("train", "valid", "test"), + root=".data", + vocab=(None, None), + tokenizer=None, +): + + text_transform = [] + if tokenizer is None: + tokenizer = get_tokenizer("basic_english") + train, val, test = DATASETS[dataset_name](train_filename=train_filename, + valid_filename=valid_filename, + test_filename=test_filename, + root=root) + raw_data = { + "train": [line for line in train], + "valid": [line for line in val] if val else None, + "test": [line for line in test] if test else None + } + + text_transform = sequential_transforms(tokenizer) + + + data_filenames = dict() + for fname in extracted_files: + if train_filename and train_filename in fname: + data_filenames["train"] = fname + if valid_filename and valid_filename in fname: + data_filenames["valid"] = fname + if test_filename and test_filename in fname: + data_filenames["test"] = fname + + datasets = [] + for key in data_filenames.keys(): + if data_filenames[key] is not None: + datasets.append( + RawSequenceTaggingIterableDataset( + _create_data_from_iob(data_filenames[key], separator))) + + return datasets + + +class SequenceTaggingDataset(torch.utils.data.Dataset): + """Defines an 
abstraction for raw text sequence tagging iterable datasets. + """ + def __init__(self, data, vocab, transforms): + super(SequenceTaggingDataset, self).__init__() + self.data = data + self.vocab = vocab + self.transforms = transforms + + def __getitem__(self, i): + line = [] + for idx, transform in enumerate(self.transforms): + line.append(transform(self.data[i][idx])) + return line + + def __len__(self): + return len(self.data) + + def get_vocab(self): + return self.vocab + + +def UDPOS(train_filename="en-ud-tag.v2.train.txt", + valid_filename="en-ud-tag.v2.dev.txt", + test_filename="en-ud-tag.v2.test.txt", + root=".data"): + """ Universal Dependencies English Web Treebank. + """ + return _setup_datasets(dataset_name="UDPOS", + root=root, + train_filename=train_filename, + valid_filename=valid_filename, + test_filename=test_filename, + separator="\t") + + +def CoNLL2000Chunking(train_filename="train.txt", + test_filename="test.txt", + root=".data"): + """ CoNLL 2000 Chunking Dataset + """ + return _setup_datasets(dataset_name="CoNLL2000Chunking", + root=root, + train_filename=train_filename, + valid_filename=None, + test_filename=test_filename, + separator=' ') + + +DATASETS = {"UDPOS": raw.UDPOS, "CoNLL2000Chunking": raw.CoNLL2000Chunking} From 49dec4b5072363711ce76f35b4f7d7210877f7a6 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 3 Jun 2020 14:00:15 +0700 Subject: [PATCH 03/24] add specialized function to handle None case --- .../datasets/raw/sequence_tagging.py | 32 +++++++++---------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py index 8155a1dff1..94b9898393 100644 --- a/torchtext/experimental/datasets/raw/sequence_tagging.py +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -28,6 +28,15 @@ def _create_data_from_iob(data_path, separator="\t"): columns[i].append(column) +def _construct_filepath(paths, filename): + if filename: + path = None + for p in paths: + path = p if filename in p else path + return path + return None + + def _setup_datasets(dataset_name, train_filename, valid_filename, @@ -48,22 +57,11 @@ def _setup_datasets(dataset_name, "URLS for {} has to be in a form or list or string".format( dataset_name)) - data_filenames = dict() - for fname in extracted_files: - if train_filename and train_filename in fname: - data_filenames["train"] = fname - else: - data_filenames["train"] = None - - if valid_filename and valid_filename in fname: - data_filenames["valid"] = fname - else: - data_filenames["valid"] = None - - if test_filename and test_filename in fname: - data_filenames["test"] = fname - else: - data_filenames["test"] = None + data_filenames = { + "train": _construct_filepath(extracted_files, train_filename), + "valid": _construct_filepath(extracted_files, valid_filename), + "test": _construct_filepath(extracted_files, test_filename) + } datasets = [] for key in data_filenames.keys(): @@ -71,6 +69,8 @@ def _setup_datasets(dataset_name, datasets.append( RawSequenceTaggingIterableDataset( _create_data_from_iob(data_filenames[key], separator))) + else: + datasets.append(None) return datasets From 3f988b142beeadf8fda7a217bfe50930acea3c59 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 3 Jun 2020 14:00:34 +0700 Subject: [PATCH 04/24] expose raw datasets for sequence tagging --- torchtext/experimental/datasets/raw/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git 
a/torchtext/experimental/datasets/raw/__init__.py b/torchtext/experimental/datasets/raw/__init__.py index 61accbe2a1..53f5e7b507 100644 --- a/torchtext/experimental/datasets/raw/__init__.py +++ b/torchtext/experimental/datasets/raw/__init__.py @@ -1,6 +1,7 @@ from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ YelpReviewFull, YahooAnswers, \ AmazonReviewPolarity, AmazonReviewFull, IMDB +from .sequence_tagging import UDPOS, CoNLL2000Chunking __all__ = ['IMDB', 'AG_NEWS', @@ -10,4 +11,6 @@ 'YelpReviewFull', 'YahooAnswers', 'AmazonReviewPolarity', - 'AmazonReviewFull'] + 'AmazonReviewFull', + 'UDPOS', + 'CoNLL2000Chunking'] From 27bbeb7513fee6c29e22085468a904f58a153213 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 3 Jun 2020 14:21:47 +0700 Subject: [PATCH 05/24] finalized sequence tagging dataset --- .../experimental/datasets/sequence_tagging.py | 124 ++++++++++++------ 1 file changed, 84 insertions(+), 40 deletions(-) diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index d885992f0f..846ba0a4c1 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -1,62 +1,94 @@ import torch from torchtext.experimental.datasets import raw +from torchtext.vocab import build_vocab_from_iterator from torchtext.experimental.functional import ( vocab_func, totensor, sequential_transforms, ) -def _build_vocab(data): - - for line in data: - for col in line: - - -def _setup_datasets( - dataset_name, - train_filename, - valid_filename, - test_filename, - separator, - data_select=("train", "valid", "test"), - root=".data", - vocab=(None, None), - tokenizer=None, -): +def _build_vocab(data, word_transform): + total_columns = len(data[0]) + data_list = [[] for _ in range(total_columns)] + vocabs = [] - text_transform = [] - if tokenizer is None: - tokenizer = get_tokenizer("basic_english") + for line in data: + for idx, col in enumerate(line): + if idx == 0: + col = word_transform(col) if word_transform else col + data_list[idx].append(col) + else: + data_list[idx].append(col) + + for it in data_list: + vocabs.append(build_vocab_from_iterator(it)) + + return vocabs + + +def _setup_datasets(dataset_name, + train_filename, + valid_filename, + test_filename, + separator, + data_select=("train", "valid", "test"), + root=".data", + vocabs=None, + word_tokenizer=None): train, val, test = DATASETS[dataset_name](train_filename=train_filename, valid_filename=valid_filename, test_filename=test_filename, root=root) raw_data = { - "train": [line for line in train], + "train": [line for line in train] if train else None, "valid": [line for line in val] if val else None, "test": [line for line in test] if test else None } - text_transform = sequential_transforms(tokenizer) - - - data_filenames = dict() - for fname in extracted_files: - if train_filename and train_filename in fname: - data_filenames["train"] = fname - if valid_filename and valid_filename in fname: - data_filenames["valid"] = fname - if test_filename and test_filename in fname: - data_filenames["test"] = fname + word_transform = None + if word_tokenizer: + word_transform = sequential_transforms(word_tokenizer) + + if vocabs is None: + if "train" not in data_select: + raise TypeError("Must pass a vocab if train is not selected.") + vocabs = _build_vocab(raw_data["train"], word_transform) + else: + if not isinstance(vocabs, list): + raise TypeError("vocabs must be an instance of list") + + # 
Find data that's not None + notnone_data = None + for key in raw_data.keys(): + if raw_data[key] is not None: + notnone_data = raw_data[key] + break + if len(vocabs) != len(notnone_data[0]): + raise ValueError( + "Number of vocabs must match the number of columns " + "in the data") + + if word_transform: + word_transform = sequential_transforms(word_transform, + vocab_func(vocabs[0]), + totensor(dtype=torch.long)) + else: + word_transform = sequential_transforms(vocab_func(vocabs[0]), + totensor(dtype=torch.long)) + labels_transforms = [ + sequential_transforms(vocab_func(vocabs[idx + 1]), + totensor(dtype=torch.long)) + for idx in range(len(vocabs) - 1) + ] + transformers = [word_transform, *labels_transforms] datasets = [] - for key in data_filenames.keys(): - if data_filenames[key] is not None: + for item in data_select: + if raw_data[item] is not None: datasets.append( - RawSequenceTaggingIterableDataset( - _create_data_from_iob(data_filenames[key], separator))) + SequenceTaggingDataset(raw_data[item], vocabs, transformers)) return datasets @@ -86,7 +118,10 @@ def get_vocab(self): def UDPOS(train_filename="en-ud-tag.v2.train.txt", valid_filename="en-ud-tag.v2.dev.txt", test_filename="en-ud-tag.v2.test.txt", - root=".data"): + data_select=("train", "valid", "test"), + root=".data", + vocabs=None, + word_tokenizer=None): """ Universal Dependencies English Web Treebank. """ return _setup_datasets(dataset_name="UDPOS", @@ -94,12 +129,18 @@ def UDPOS(train_filename="en-ud-tag.v2.train.txt", train_filename=train_filename, valid_filename=valid_filename, test_filename=test_filename, - separator="\t") + separator="\t", + data_select=data_select, + vocabs=vocabs, + word_tokenizer=word_tokenizer) def CoNLL2000Chunking(train_filename="train.txt", test_filename="test.txt", - root=".data"): + data_select=("train", "valid", "test"), + root=".data", + vocabs=None, + word_tokenizer=None): """ CoNLL 2000 Chunking Dataset """ return _setup_datasets(dataset_name="CoNLL2000Chunking", @@ -107,7 +148,10 @@ def CoNLL2000Chunking(train_filename="train.txt", train_filename=train_filename, valid_filename=None, test_filename=test_filename, - separator=' ') + separator=' ', + data_select=data_select, + vocabs=vocabs, + word_tokenizer=word_tokenizer) DATASETS = {"UDPOS": raw.UDPOS, "CoNLL2000Chunking": raw.CoNLL2000Chunking} From a99723f3df92a9815f9d524f5bae6a593de7a61b Mon Sep 17 00:00:00 2001 From: akurniawan Date: Thu, 4 Jun 2020 07:51:47 +0700 Subject: [PATCH 06/24] add documentation --- .../datasets/raw/sequence_tagging.py | 30 ++++++++- .../experimental/datasets/sequence_tagging.py | 65 ++++++++++++++++++- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py index 94b9898393..76052451f8 100644 --- a/torchtext/experimental/datasets/raw/sequence_tagging.py +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -110,7 +110,22 @@ def UDPOS(train_filename="en-ud-tag.v2.train.txt", valid_filename="en-ud-tag.v2.dev.txt", test_filename="en-ud-tag.v2.test.txt", root=".data"): - """ Universal Dependencies English Web Treebank. + """ Universal Dependencies English Web Treebank + + Separately returns the training and test dataset + + Arguments: + train_filename: Filename for training dataset. + Default: en-ud-tag.v2.train.txt + valid_filename: Filename for validation dataset. + Default: en-ud-tag.v2.dev.txt + test_filename: Filename for test dataset. 
+ Default: en-ud-tag.v2.test.txt + root: Directory where the datasets are saved. Default: ".data" + + Examples: + >>> from torchtext.datasets.raw import UDPOS + >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ return _setup_datasets(dataset_name="UDPOS", root=root, @@ -125,6 +140,19 @@ def CoNLL2000Chunking(train_filename="train.txt", test_filename="test.txt", root=".data"): """ CoNLL 2000 Chunking Dataset + + Separately returns the training and test dataset + + Arguments: + train_filename: Filename for training dataset. + Default: train.txt + test_filename: Filename for test dataset. + Default: test.txt + root: Directory where the datasets are saved. Default: ".data" + + Examples: + >>> from torchtext.datasets.raw import CoNLL2000Chunking + >>> train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() """ return _setup_datasets(dataset_name="CoNLL2000Chunking", root=root, diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 846ba0a4c1..085c5d512c 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -95,8 +95,19 @@ def _setup_datasets(dataset_name, class SequenceTaggingDataset(torch.utils.data.Dataset): """Defines an abstraction for raw text sequence tagging iterable datasets. + Currently, we only support the following datasets: + - UDPOS + - CoNLL2000Chunking """ def __init__(self, data, vocab, transforms): + """Initiate sequence tagging dataset. + Arguments: + data: a list of word and its respective tags. Example: + [[word, POS, dep_parsing label, ...]] + vocab: Vocabulary object used for dataset. + transforms: a list of string transforms for words and tags. + """ + super(SequenceTaggingDataset, self).__init__() self.data = data self.vocab = vocab @@ -122,7 +133,34 @@ def UDPOS(train_filename="en-ud-tag.v2.train.txt", root=".data", vocabs=None, word_tokenizer=None): - """ Universal Dependencies English Web Treebank. + """ Universal Dependencies English Web Treebank + + Separately returns the training and test dataset + + Arguments: + train_filename: Filename for training dataset. + Default: en-ud-tag.v2.train.txt + valid_filename: Filename for validation dataset. + Default: en-ud-tag.v2.dev.txt + test_filename: Filename for test dataset. + Default: en-ud-tag.v2.test.txt + data_select: a string or tuple for the returned datasets + (Default: ('train', 'valid', 'test')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. + root: Directory where the datasets are saved. Default: ".data" + vocabs: A list of voabularies for each columns in the dataset. Must be in an + instance of List + Default: None + word_tokenizer: The tokenizer used to preprocess word column in raw text data + Default: None + + Examples: + >>> from torchtext.datasets.raw import UDPOS + >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ return _setup_datasets(dataset_name="UDPOS", root=root, @@ -142,6 +180,31 @@ def CoNLL2000Chunking(train_filename="train.txt", vocabs=None, word_tokenizer=None): """ CoNLL 2000 Chunking Dataset + + Separately returns the training and test dataset + + Arguments: + train_filename: Filename for training dataset. 
+ Default: train.txt + test_filename: Filename for test dataset. + Default: test.txt + data_select: a string or tuple for the returned datasets + (Default: ('train', 'valid', 'test')) + By default, all the three datasets (train, test, valid) are generated. Users + could also choose any one or two of them, for example ('train', 'test') or + just a string 'train'. If 'train' is not in the tuple or string, a vocab + object should be provided which will be used to process valid and/or test + data. + root: Directory where the datasets are saved. Default: ".data" + vocabs: A list of voabularies for each columns in the dataset. Must be in an + instance of List + Default: None + word_tokenizer: The tokenizer used to preprocess word column in raw text data + Default: None + + Examples: + >>> from torchtext.datasets.raw import CoNLL2000Chunking + >>> train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() """ return _setup_datasets(dataset_name="CoNLL2000Chunking", root=root, From b7ba5b0d3177e2e705f2bbc330d9dea57494ea1b Mon Sep 17 00:00:00 2001 From: akurniawan Date: Thu, 4 Jun 2020 08:28:17 +0700 Subject: [PATCH 07/24] expose sequence tagging data --- torchtext/experimental/datasets/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/torchtext/experimental/datasets/__init__.py b/torchtext/experimental/datasets/__init__.py index ac2faa423b..a5f717f883 100644 --- a/torchtext/experimental/datasets/__init__.py +++ b/torchtext/experimental/datasets/__init__.py @@ -2,6 +2,7 @@ from .text_classification import AG_NEWS, SogouNews, DBpedia, YelpReviewPolarity, \ YelpReviewFull, YahooAnswers, \ AmazonReviewPolarity, AmazonReviewFull, IMDB +from .sequence_tagging import UDPOS, CoNLL2000Chunking __all__ = ['LanguageModelingDataset', 'WikiText2', @@ -15,4 +16,6 @@ 'YelpReviewFull', 'YahooAnswers', 'AmazonReviewPolarity', - 'AmazonReviewFull'] + 'AmazonReviewFull', + 'UDPOS', + 'CoNLL2000Chunking'] From 36d4652cbe7cf45865b40f67f889bf3620085658 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Thu, 4 Jun 2020 08:28:31 +0700 Subject: [PATCH 08/24] add unit test for sequence tagging --- test/data/test_builtin_datasets.py | 58 ++++++++++++++++++++++++++++++ 1 file changed, 58 insertions(+) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 08c75292c4..193db51cb1 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -122,3 +122,61 @@ def test_imdb(self): old_vocab = train_dataset.get_vocab() new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500) new_train_data, new_test_data = IMDB(vocab=new_vocab) + + def test_sequence_tagging(self): + from torchtext.experimental.datasets import UDPOS + + # smoke test to ensure imdb works properly + train_dataset, valid_dataset, test_dataset = UDPOS() + self.assertEqual(len(train_dataset), 12543) + self.assertEqual(len(valid_dataset), 2002) + self.assertEqual(len(test_dataset), 2077) + assert_allclose(train_dataset[0][0][:10], + torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) + assert_allclose(train_dataset[0][1][:10], + torch.tensor([8, 3, 8, 3, 9, 2, 4, 8, 8, 8]).long()) + assert_allclose(train_dataset[0][2][:10], + torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) + assert_allclose(train_dataset[-1][0][:10], + torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) + assert_allclose(train_dataset[-1][1][:10], + torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) + assert_allclose(train_dataset[-1][2][:10], + 
torch.tensor([6, 20, 8, 10, 8, 8, 24, 13, 8, 15]).long()) + + assert_allclose(valid_dataset[0][0][:10], + torch.tensor([746, 3, 10633, 656, 25, 1334, 45]).long()) + assert_allclose(valid_dataset[0][1][:10], + torch.tensor([6, 7, 8, 4, 7, 2, 3]).long()) + assert_allclose(valid_dataset[0][2][:10], + torch.tensor([3, 4, 5, 16, 4, 2, 27]).long()) + assert_allclose(valid_dataset[-1][0][:10], + torch.tensor([354, 4, 31, 17, 141, 421, 148, 6, 7, 78]).long()) + assert_allclose(valid_dataset[-1][1][:10], + torch.tensor([11, 3, 5, 4, 9, 2, 2, 12, 7, 11]).long()) + assert_allclose(valid_dataset[-1][2][:10], + torch.tensor([8, 12, 6, 15, 7, 2, 2, 13, 4, 8]).long()) + + assert_allclose(test_dataset[0][0][:10], + torch.tensor([210, 54, 3115, 0, 12229, 0, 33]).long()) + assert_allclose(test_dataset[0][1][:10], + torch.tensor([5, 15, 8, 4, 6, 8, 3]).long()) + assert_allclose(test_dataset[0][2][:10], + torch.tensor([30, 3, 5, 14, 3, 5, 9]).long()) + assert_allclose(test_dataset[-1][0][:10], + torch.tensor([116, 0, 6, 11, 412, 10, 0, 4, 0, 6]).long()) + assert_allclose(test_dataset[-1][1][:10], + torch.tensor([5, 4, 12, 10, 9, 15, 4, 3, 4, 12]).long()) + assert_allclose(test_dataset[-1][2][:10], + torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) + + # Assert vocabs + self.assertEqual(len(train_dataset.get_vocab()), 3) + self.assertEqual(len(train_dataset.get_vocab()[0]), 19674) + self.assertEqual(len(train_dataset.get_vocab()[1]), 19) + self.assertEqual(len(train_dataset.get_vocab()[2]), 52) + + # Assert token ids + word_vocab = train_dataset.get_vocab()[0] + tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] + self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) \ No newline at end of file From b79839c6b12b8b240d9e21cf93674f6716fdd20e Mon Sep 17 00:00:00 2001 From: akurniawan Date: Thu, 4 Jun 2020 08:32:00 +0700 Subject: [PATCH 09/24] fix linting --- test/data/test_builtin_datasets.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 193db51cb1..7b9987db6c 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -138,7 +138,7 @@ def test_sequence_tagging(self): assert_allclose(train_dataset[0][2][:10], torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) assert_allclose(train_dataset[-1][0][:10], - torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) + torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) assert_allclose(train_dataset[-1][1][:10], torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) assert_allclose(train_dataset[-1][2][:10], @@ -179,4 +179,4 @@ def test_sequence_tagging(self): # Assert token ids word_vocab = train_dataset.get_vocab()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] - self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) \ No newline at end of file + self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) From a03eec42b9ba23ffee01f4d14ba17901bb4d1904 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 08:50:16 +0700 Subject: [PATCH 10/24] remove filename arguments --- .../datasets/raw/sequence_tagging.py | 57 +++------ .../experimental/datasets/sequence_tagging.py | 108 ++++++------------ 2 files changed, 45 insertions(+), 120 deletions(-) diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py index 76052451f8..0e3655be43 100644 --- 
a/torchtext/experimental/datasets/raw/sequence_tagging.py +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -28,21 +28,16 @@ def _create_data_from_iob(data_path, separator="\t"): columns[i].append(column) -def _construct_filepath(paths, filename): - if filename: +def _construct_filepath(paths, file_suffix): + if file_suffix: path = None for p in paths: - path = p if filename in p else path + path = p if p.endswith(file_suffix) else path return path return None -def _setup_datasets(dataset_name, - train_filename, - valid_filename, - test_filename, - separator, - root=".data"): +def _setup_datasets(dataset_name, separator, root=".data"): extracted_files = [] if isinstance(URLS[dataset_name], list): @@ -58,9 +53,9 @@ def _setup_datasets(dataset_name, dataset_name)) data_filenames = { - "train": _construct_filepath(extracted_files, train_filename), - "valid": _construct_filepath(extracted_files, valid_filename), - "test": _construct_filepath(extracted_files, test_filename) + "train": _construct_filepath(extracted_files, "train.txt"), + "valid": _construct_filepath(extracted_files, "dev.txt"), + "test": _construct_filepath(extracted_files, "test.txt") } datasets = [] @@ -106,60 +101,34 @@ def get_iterator(self): return self._iterator -def UDPOS(train_filename="en-ud-tag.v2.train.txt", - valid_filename="en-ud-tag.v2.dev.txt", - test_filename="en-ud-tag.v2.test.txt", - root=".data"): +def UDPOS(*args, **kwargs): """ Universal Dependencies English Web Treebank Separately returns the training and test dataset Arguments: - train_filename: Filename for training dataset. - Default: en-ud-tag.v2.train.txt - valid_filename: Filename for validation dataset. - Default: en-ud-tag.v2.dev.txt - test_filename: Filename for test dataset. - Default: en-ud-tag.v2.test.txt root: Directory where the datasets are saved. Default: ".data" Examples: >>> from torchtext.datasets.raw import UDPOS >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ - return _setup_datasets(dataset_name="UDPOS", - root=root, - train_filename=train_filename, - valid_filename=valid_filename, - test_filename=test_filename, - separator="\t") - - -def CoNLL2000Chunking(train_filename="train.txt", - valid_filename=None, - test_filename="test.txt", - root=".data"): + return _setup_datasets(*(("UDPOS", "\t") + args), **kwargs) + + +def CoNLL2000Chunking(*args, **kwargs): """ CoNLL 2000 Chunking Dataset Separately returns the training and test dataset Arguments: - train_filename: Filename for training dataset. - Default: train.txt - test_filename: Filename for test dataset. - Default: test.txt root: Directory where the datasets are saved. 
Default: ".data" Examples: >>> from torchtext.datasets.raw import CoNLL2000Chunking >>> train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() """ - return _setup_datasets(dataset_name="CoNLL2000Chunking", - root=root, - train_filename=train_filename, - valid_filename=valid_filename, - test_filename=test_filename, - separator=' ') + return _setup_datasets(*(("CoNLL2000Chunking", " ") + args), **kwargs) DATASETS = {"UDPOS": UDPOS, "CoNLL2000Chunking": CoNLL2000Chunking} diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 085c5d512c..716ac3af2a 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -9,7 +9,7 @@ ) -def _build_vocab(data, word_transform): +def _build_vocab(data, text_transform): total_columns = len(data[0]) data_list = [[] for _ in range(total_columns)] vocabs = [] @@ -17,7 +17,7 @@ def _build_vocab(data, word_transform): for line in data: for idx, col in enumerate(line): if idx == 0: - col = word_transform(col) if word_transform else col + col = text_transform(col) if text_transform else col data_list[idx].append(col) else: data_list[idx].append(col) @@ -29,32 +29,25 @@ def _build_vocab(data, word_transform): def _setup_datasets(dataset_name, - train_filename, - valid_filename, - test_filename, - separator, - data_select=("train", "valid", "test"), root=".data", vocabs=None, - word_tokenizer=None): - train, val, test = DATASETS[dataset_name](train_filename=train_filename, - valid_filename=valid_filename, - test_filename=test_filename, - root=root) + tokenizer=None, + data_select=("train", "valid", "test")): + train, val, test = DATASETS[dataset_name](root=root) raw_data = { "train": [line for line in train] if train else None, "valid": [line for line in val] if val else None, "test": [line for line in test] if test else None } - word_transform = None - if word_tokenizer: - word_transform = sequential_transforms(word_tokenizer) + text_transform = None + if tokenizer: + text_transform = sequential_transforms(tokenizer) if vocabs is None: if "train" not in data_select: raise TypeError("Must pass a vocab if train is not selected.") - vocabs = _build_vocab(raw_data["train"], word_transform) + vocabs = _build_vocab(raw_data["train"], text_transform) else: if not isinstance(vocabs, list): raise TypeError("vocabs must be an instance of list") @@ -70,19 +63,19 @@ def _setup_datasets(dataset_name, "Number of vocabs must match the number of columns " "in the data") - if word_transform: - word_transform = sequential_transforms(word_transform, + if text_transform: + text_transform = sequential_transforms(text_transform, vocab_func(vocabs[0]), totensor(dtype=torch.long)) else: - word_transform = sequential_transforms(vocab_func(vocabs[0]), + text_transform = sequential_transforms(vocab_func(vocabs[0]), totensor(dtype=torch.long)) labels_transforms = [ sequential_transforms(vocab_func(vocabs[idx + 1]), totensor(dtype=torch.long)) for idx in range(len(vocabs) - 1) ] - transformers = [word_transform, *labels_transforms] + transformers = [text_transform, *labels_transforms] datasets = [] for item in data_select: @@ -126,24 +119,18 @@ def get_vocab(self): return self.vocab -def UDPOS(train_filename="en-ud-tag.v2.train.txt", - valid_filename="en-ud-tag.v2.dev.txt", - test_filename="en-ud-tag.v2.test.txt", - data_select=("train", "valid", "test"), - root=".data", - vocabs=None, - word_tokenizer=None): +def UDPOS(*args, **kwargs): """ Universal 
Dependencies English Web Treebank - Separately returns the training and test dataset + Separately returns the training, validation, and test dataset Arguments: - train_filename: Filename for training dataset. - Default: en-ud-tag.v2.train.txt - valid_filename: Filename for validation dataset. - Default: en-ud-tag.v2.dev.txt - test_filename: Filename for test dataset. - Default: en-ud-tag.v2.test.txt + root: Directory where the datasets are saved. Default: ".data" + vocabs: A list of voabularies for each columns in the dataset. Must be in an + instance of List + Default: None + tokenizer: The tokenizer used to preprocess word column in raw text data + Default: None data_select: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the three datasets (train, test, valid) are generated. Users @@ -151,43 +138,26 @@ def UDPOS(train_filename="en-ud-tag.v2.train.txt", just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. - root: Directory where the datasets are saved. Default: ".data" - vocabs: A list of voabularies for each columns in the dataset. Must be in an - instance of List - Default: None - word_tokenizer: The tokenizer used to preprocess word column in raw text data - Default: None Examples: >>> from torchtext.datasets.raw import UDPOS >>> train_dataset, valid_dataset, test_dataset = UDPOS() """ - return _setup_datasets(dataset_name="UDPOS", - root=root, - train_filename=train_filename, - valid_filename=valid_filename, - test_filename=test_filename, - separator="\t", - data_select=data_select, - vocabs=vocabs, - word_tokenizer=word_tokenizer) - - -def CoNLL2000Chunking(train_filename="train.txt", - test_filename="test.txt", - data_select=("train", "valid", "test"), - root=".data", - vocabs=None, - word_tokenizer=None): + return _setup_datasets(*(("UDPOS", ) + args), **kwargs) + + +def CoNLL2000Chunking(*args, **kwargs): """ CoNLL 2000 Chunking Dataset Separately returns the training and test dataset Arguments: - train_filename: Filename for training dataset. - Default: train.txt - test_filename: Filename for test dataset. - Default: test.txt + root: Directory where the datasets are saved. Default: ".data" + vocabs: A list of voabularies for each columns in the dataset. Must be in an + instance of List + Default: None + tokenizer: The tokenizer used to preprocess word column in raw text data + Default: None data_select: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the three datasets (train, test, valid) are generated. Users @@ -195,26 +165,12 @@ def CoNLL2000Chunking(train_filename="train.txt", just a string 'train'. If 'train' is not in the tuple or string, a vocab object should be provided which will be used to process valid and/or test data. - root: Directory where the datasets are saved. Default: ".data" - vocabs: A list of voabularies for each columns in the dataset. 
Must be in an - instance of List - Default: None - word_tokenizer: The tokenizer used to preprocess word column in raw text data - Default: None Examples: >>> from torchtext.datasets.raw import CoNLL2000Chunking >>> train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() """ - return _setup_datasets(dataset_name="CoNLL2000Chunking", - root=root, - train_filename=train_filename, - valid_filename=None, - test_filename=test_filename, - separator=' ', - data_select=data_select, - vocabs=vocabs, - word_tokenizer=word_tokenizer) + return _setup_datasets(*(("CoNLL2000Chunking", ) + args), **kwargs) DATASETS = {"UDPOS": raw.UDPOS, "CoNLL2000Chunking": raw.CoNLL2000Chunking} From 481ea37e86133fed25121c18f93640dbdeab1f5a Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 08:50:34 +0700 Subject: [PATCH 11/24] [WIP] adding conll test --- test/data/test_builtin_datasets.py | 60 +++++++++++++++++++++++++++++- 1 file changed, 59 insertions(+), 1 deletion(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 7b9987db6c..c2a253b320 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -123,7 +123,7 @@ def test_imdb(self): new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500) new_train_data, new_test_data = IMDB(vocab=new_vocab) - def test_sequence_tagging(self): + def test_udpos_sequence_tagging(self): from torchtext.experimental.datasets import UDPOS # smoke test to ensure imdb works properly @@ -180,3 +180,61 @@ def test_sequence_tagging(self): word_vocab = train_dataset.get_vocab()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) + + def test_conll_sequence_tagging(self): + from torchtext.experimental.datasets import CoNLL2000Chunking + + # smoke test to ensure imdb works properly + train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() + self.assertEqual(len(train_dataset), 12543) + self.assertEqual(len(valid_dataset), 2002) + self.assertEqual(len(test_dataset), 2077) + assert_allclose(train_dataset[0][0][:10], + torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) + assert_allclose(train_dataset[0][1][:10], + torch.tensor([8, 3, 8, 3, 9, 2, 4, 8, 8, 8]).long()) + assert_allclose(train_dataset[0][2][:10], + torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) + assert_allclose(train_dataset[-1][0][:10], + torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) + assert_allclose(train_dataset[-1][1][:10], + torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) + assert_allclose(train_dataset[-1][2][:10], + torch.tensor([6, 20, 8, 10, 8, 8, 24, 13, 8, 15]).long()) + + assert_allclose(valid_dataset[0][0][:10], + torch.tensor([746, 3, 10633, 656, 25, 1334, 45]).long()) + assert_allclose(valid_dataset[0][1][:10], + torch.tensor([6, 7, 8, 4, 7, 2, 3]).long()) + assert_allclose(valid_dataset[0][2][:10], + torch.tensor([3, 4, 5, 16, 4, 2, 27]).long()) + assert_allclose(valid_dataset[-1][0][:10], + torch.tensor([354, 4, 31, 17, 141, 421, 148, 6, 7, 78]).long()) + assert_allclose(valid_dataset[-1][1][:10], + torch.tensor([11, 3, 5, 4, 9, 2, 2, 12, 7, 11]).long()) + assert_allclose(valid_dataset[-1][2][:10], + torch.tensor([8, 12, 6, 15, 7, 2, 2, 13, 4, 8]).long()) + + assert_allclose(test_dataset[0][0][:10], + torch.tensor([210, 54, 3115, 0, 12229, 0, 33]).long()) + assert_allclose(test_dataset[0][1][:10], + torch.tensor([5, 15, 8, 4, 6, 8, 3]).long()) + 
assert_allclose(test_dataset[0][2][:10], + torch.tensor([30, 3, 5, 14, 3, 5, 9]).long()) + assert_allclose(test_dataset[-1][0][:10], + torch.tensor([116, 0, 6, 11, 412, 10, 0, 4, 0, 6]).long()) + assert_allclose(test_dataset[-1][1][:10], + torch.tensor([5, 4, 12, 10, 9, 15, 4, 3, 4, 12]).long()) + assert_allclose(test_dataset[-1][2][:10], + torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) + + # Assert vocabs + self.assertEqual(len(train_dataset.get_vocab()), 3) + self.assertEqual(len(train_dataset.get_vocab()[0]), 19674) + self.assertEqual(len(train_dataset.get_vocab()[1]), 19) + self.assertEqual(len(train_dataset.get_vocab()[2]), 52) + + # Assert token ids + word_vocab = train_dataset.get_vocab()[0] + tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] + self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) \ No newline at end of file From fb3f8f56eace9e78c41414f55f2c7faa10423e8a Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 09:01:58 +0700 Subject: [PATCH 12/24] move the test order with translation dataset and finalize conll testing --- test/data/test_builtin_datasets.py | 116 +++++++++++++---------------- 1 file changed, 52 insertions(+), 64 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 4e079c203b..6558acc744 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -125,6 +125,38 @@ def test_imdb(self): new_vocab = Vocab(counter=old_vocab.freqs, max_size=2500) new_train_data, new_test_data = IMDB(vocab=new_vocab) + def test_multi30k(self): + from torchtext.experimental.datasets.translation import Multi30k + # smoke test to ensure multi30k works properly + train_dataset, valid_dataset, test_dataset = Multi30k() + self.assertEqual(len(train_dataset), 29000) + self.assertEqual(len(valid_dataset), 1000) + self.assertEqual(len(test_dataset), 1014) + + de_vocab, en_vocab = train_dataset.get_vocab() + de_tokens_ids = [ + de_vocab[token] for token in + 'Zwei Männer verpacken Donuts in Kunststofffolie'.split() + ] + self.assertEqual(de_tokens_ids, [19, 29, 18703, 4448, 5, 6240]) + + en_tokens_ids = [ + en_vocab[token] for token in + 'Two young White males are outside near many bushes'.split() + ] + self.assertEqual(en_tokens_ids, + [17, 23, 1167, 806, 15, 55, 82, 334, 1337]) + + datafile = os.path.join(self.project_root, ".data", "train*") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "val*") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", "test*") + conditional_remove(datafile) + datafile = os.path.join(self.project_root, ".data", + "multi30k_task*.tar.gz") + conditional_remove(datafile) + def test_udpos_sequence_tagging(self): from torchtext.experimental.datasets import UDPOS @@ -187,87 +219,43 @@ def test_conll_sequence_tagging(self): from torchtext.experimental.datasets import CoNLL2000Chunking # smoke test to ensure imdb works properly - train_dataset, valid_dataset, test_dataset = CoNLL2000Chunking() - self.assertEqual(len(train_dataset), 12543) - self.assertEqual(len(valid_dataset), 2002) - self.assertEqual(len(test_dataset), 2077) + train_dataset, test_dataset = CoNLL2000Chunking() + self.assertEqual(len(train_dataset), 8936) + self.assertEqual(len(test_dataset), 2012) assert_allclose(train_dataset[0][0][:10], - torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) + torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long()) 
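# ---------------------------------------------------------------------------
# Editorial aside (not part of the patch above): the integer columns asserted
# in these tests come from IOB-style files, where every non-blank line holds
# one token followed by its tag columns and a blank line ends a sentence.
# A minimal, self-contained sketch of that parsing idea, mirroring the
# _create_data_from_iob generator introduced earlier in this series (the
# helper name iob_to_columns is illustrative only):
def iob_to_columns(lines, separator=" "):
    columns = []
    for line in lines:
        line = line.strip()
        if line == "":
            if columns:
                yield columns
            columns = []
        else:
            for i, column in enumerate(line.split(separator)):
                if len(columns) < i + 1:
                    columns.append([])
                columns[i].append(column)
    if columns:  # handle files without a trailing blank line
        yield columns

# Two CoNLL-2000-style rows plus a sentence break yield one sentence of three
# columns: [['Confidence', 'in'], ['NN', 'IN'], ['B-NP', 'B-PP']].
print(list(iob_to_columns(["Confidence NN B-NP", "in IN B-PP", ""])))
# ---------------------------------------------------------------------------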
assert_allclose(train_dataset[0][1][:10], - torch.tensor([8, 3, 8, 3, 9, 2, 4, 8, 8, 8]).long()) + torch.tensor([2, 3, 5, 2, 17, 12, 16, 15, 13, 5]).long()) assert_allclose(train_dataset[0][2][:10], - torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) + torch.tensor([3, 6, 3, 2, 5, 7, 7, 7, 7, 3]).long()) assert_allclose(train_dataset[-1][0][:10], - torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) + torch.tensor([85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502]).long()) assert_allclose(train_dataset[-1][1][:10], - torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) + torch.tensor([18, 17, 12, 19, 10, 6, 3, 3, 4, 4]).long()) assert_allclose(train_dataset[-1][2][:10], - torch.tensor([6, 20, 8, 10, 8, 8, 24, 13, 8, 15]).long()) - - assert_allclose(valid_dataset[0][0][:10], - torch.tensor([746, 3, 10633, 656, 25, 1334, 45]).long()) - assert_allclose(valid_dataset[0][1][:10], - torch.tensor([6, 7, 8, 4, 7, 2, 3]).long()) - assert_allclose(valid_dataset[0][2][:10], - torch.tensor([3, 4, 5, 16, 4, 2, 27]).long()) - assert_allclose(valid_dataset[-1][0][:10], - torch.tensor([354, 4, 31, 17, 141, 421, 148, 6, 7, 78]).long()) - assert_allclose(valid_dataset[-1][1][:10], - torch.tensor([11, 3, 5, 4, 9, 2, 2, 12, 7, 11]).long()) - assert_allclose(valid_dataset[-1][2][:10], - torch.tensor([8, 12, 6, 15, 7, 2, 2, 13, 4, 8]).long()) + torch.tensor([3, 5, 7, 7, 3, 2, 6, 6, 3, 2]).long()) assert_allclose(test_dataset[0][0][:10], - torch.tensor([210, 54, 3115, 0, 12229, 0, 33]).long()) + torch.tensor([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7]).long()) assert_allclose(test_dataset[0][1][:10], - torch.tensor([5, 15, 8, 4, 6, 8, 3]).long()) + torch.tensor([4, 4, 4, 23, 4, 2, 11, 18, 11, 5]).long()) assert_allclose(test_dataset[0][2][:10], - torch.tensor([30, 3, 5, 14, 3, 5, 9]).long()) + torch.tensor([3, 2, 2, 3, 2, 2, 5, 3, 5, 3]).long()) assert_allclose(test_dataset[-1][0][:10], - torch.tensor([116, 0, 6, 11, 412, 10, 0, 4, 0, 6]).long()) + torch.tensor([51, 456, 560, 2, 11, 465, 2, 1413, 36, 60]).long()) assert_allclose(test_dataset[-1][1][:10], - torch.tensor([5, 4, 12, 10, 9, 15, 4, 3, 4, 12]).long()) + torch.tensor([3, 4, 4, 8, 3, 2, 8, 4, 17, 16]).long()) assert_allclose(test_dataset[-1][2][:10], - torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) + torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long()) # Assert vocabs self.assertEqual(len(train_dataset.get_vocab()), 3) - self.assertEqual(len(train_dataset.get_vocab()[0]), 19674) - self.assertEqual(len(train_dataset.get_vocab()[1]), 19) - self.assertEqual(len(train_dataset.get_vocab()[2]), 52) + self.assertEqual(len(train_dataset.get_vocab()[0]), 19124) + self.assertEqual(len(train_dataset.get_vocab()[1]), 46) + self.assertEqual(len(train_dataset.get_vocab()[2]), 24) # Assert token ids word_vocab = train_dataset.get_vocab()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] - self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) - def test_multi30k(self): - from torchtext.experimental.datasets.translation import Multi30k - # smoke test to ensure multi30k works properly - train_dataset, valid_dataset, test_dataset = Multi30k() - self.assertEqual(len(train_dataset), 29000) - self.assertEqual(len(valid_dataset), 1000) - self.assertEqual(len(test_dataset), 1014) - - de_vocab, en_vocab = train_dataset.get_vocab() - de_tokens_ids = [ - de_vocab[token] for token in - 'Zwei Männer verpacken Donuts in Kunststofffolie'.split() - ] - self.assertEqual(de_tokens_ids, [19, 29, 18703, 4448, 5, 
6240]) - - en_tokens_ids = [ - en_vocab[token] for token in - 'Two young White males are outside near many bushes'.split() - ] - self.assertEqual(en_tokens_ids, - [17, 23, 1167, 806, 15, 55, 82, 334, 1337]) - - datafile = os.path.join(self.project_root, ".data", "train*") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "val*") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", "test*") - conditional_remove(datafile) - datafile = os.path.join(self.project_root, ".data", - "multi30k_task*.tar.gz") - conditional_remove(datafile) + self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690]) + \ No newline at end of file From 9a84a8a106d5839be5b08d01dcdfd5e2c53cbe76 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 09:13:18 +0700 Subject: [PATCH 13/24] add doc string for sequence tagging dataset --- docs/source/experimental_datasets.rst | 36 +++++++++++++++++++++------ 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/docs/source/experimental_datasets.rst b/docs/source/experimental_datasets.rst index a16d5dabba..6ae17dc34d 100644 --- a/docs/source/experimental_datasets.rst +++ b/docs/source/experimental_datasets.rst @@ -34,9 +34,9 @@ IMDb ~~~~ .. autoclass:: IMDB - :members: __init__ - - + :members: __init__ + + Text Classification ^^^^^^^^^^^^^^^^^^^ @@ -109,8 +109,8 @@ AmazonReviewFull dataset is subclass of ``TextClassificationDataset`` class. .. autoclass:: AmazonReviewFull :members: __init__ - - + + Language Modeling ^^^^^^^^^^^^^^^^^ @@ -124,21 +124,21 @@ WikiText-2 ~~~~~~~~~~ .. autoclass:: WikiText2 - :members: __init__ + :members: __init__ WikiText103 ~~~~~~~~~~~ .. autoclass:: WikiText103 - :members: __init__ + :members: __init__ PennTreebank ~~~~~~~~~~~~ .. autoclass:: PennTreebank - :members: __init__ + :members: __init__ Machine Translation @@ -167,3 +167,23 @@ WMT14 .. autoclass:: WMT14 :members: __init__ + +Sequence Tagging +^^^^^^^^^^^^^^^^ + +Language modeling datasets are subclasses of ``SequenceTaggingDataset`` class. + +.. autoclass:: SequenceTaggingDataset + :members: __init__ + +UDPOS +~~~~~ + +.. autoclass:: UDPOS + :members: __init__ + +CoNLL2000Chunking +~~~~~ + +.. autoclass:: CoNLL2000Chunking + :members: __init__ From 9d11bc17f251b5911da09c5f5b5100dd99efd468 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 09:19:38 +0700 Subject: [PATCH 14/24] remove spaces at the end of the file --- test/data/test_builtin_datasets.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 6558acc744..1b7e435147 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -258,4 +258,3 @@ def test_conll_sequence_tagging(self): word_vocab = train_dataset.get_vocab()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690]) - \ No newline at end of file From b1c5ec46e01949cdc278ca1d40d73bc13bc091e8 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 10:01:11 +0700 Subject: [PATCH 15/24] reformat docstring --- docs/source/experimental_datasets.rst | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/source/experimental_datasets.rst b/docs/source/experimental_datasets.rst index 6ae17dc34d..a619e40df4 100644 --- a/docs/source/experimental_datasets.rst +++ b/docs/source/experimental_datasets.rst @@ -34,9 +34,9 @@ IMDb ~~~~ .. 
autoclass:: IMDB - :members: __init__ - - + :members: __init__ + + Text Classification ^^^^^^^^^^^^^^^^^^^ @@ -109,8 +109,8 @@ AmazonReviewFull dataset is subclass of ``TextClassificationDataset`` class. .. autoclass:: AmazonReviewFull :members: __init__ - - + + Language Modeling ^^^^^^^^^^^^^^^^^ @@ -124,21 +124,21 @@ WikiText-2 ~~~~~~~~~~ .. autoclass:: WikiText2 - :members: __init__ + :members: __init__ WikiText103 ~~~~~~~~~~~ .. autoclass:: WikiText103 - :members: __init__ + :members: __init__ PennTreebank ~~~~~~~~~~~~ .. autoclass:: PennTreebank - :members: __init__ + :members: __init__ Machine Translation @@ -187,3 +187,4 @@ CoNLL2000Chunking .. autoclass:: CoNLL2000Chunking :members: __init__ + From c38a0b8ae193f45e7723d19e945672cc5f14c281 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 5 Jun 2020 12:28:42 +0700 Subject: [PATCH 16/24] remove tokenizer --- .../experimental/datasets/sequence_tagging.py | 33 ++++--------------- 1 file changed, 6 insertions(+), 27 deletions(-) diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 716ac3af2a..5ca1b3a27d 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -9,18 +9,14 @@ ) -def _build_vocab(data, text_transform): +def _build_vocab(data): total_columns = len(data[0]) data_list = [[] for _ in range(total_columns)] vocabs = [] for line in data: for idx, col in enumerate(line): - if idx == 0: - col = text_transform(col) if text_transform else col - data_list[idx].append(col) - else: - data_list[idx].append(col) + data_list[idx].append(col) for it in data_list: vocabs.append(build_vocab_from_iterator(it)) @@ -31,7 +27,6 @@ def _build_vocab(data, text_transform): def _setup_datasets(dataset_name, root=".data", vocabs=None, - tokenizer=None, data_select=("train", "valid", "test")): train, val, test = DATASETS[dataset_name](root=root) raw_data = { @@ -40,14 +35,10 @@ def _setup_datasets(dataset_name, "test": [line for line in test] if test else None } - text_transform = None - if tokenizer: - text_transform = sequential_transforms(tokenizer) - if vocabs is None: if "train" not in data_select: raise TypeError("Must pass a vocab if train is not selected.") - vocabs = _build_vocab(raw_data["train"], text_transform) + vocabs = _build_vocab(raw_data["train"]) else: if not isinstance(vocabs, list): raise TypeError("vocabs must be an instance of list") @@ -63,19 +54,11 @@ def _setup_datasets(dataset_name, "Number of vocabs must match the number of columns " "in the data") - if text_transform: - text_transform = sequential_transforms(text_transform, - vocab_func(vocabs[0]), - totensor(dtype=torch.long)) - else: - text_transform = sequential_transforms(vocab_func(vocabs[0]), - totensor(dtype=torch.long)) - labels_transforms = [ - sequential_transforms(vocab_func(vocabs[idx + 1]), + transformers = [ + sequential_transforms(vocab_func(vocabs[idx]), totensor(dtype=torch.long)) - for idx in range(len(vocabs) - 1) + for idx in range(len(vocabs)) ] - transformers = [text_transform, *labels_transforms] datasets = [] for item in data_select: @@ -129,8 +112,6 @@ def UDPOS(*args, **kwargs): vocabs: A list of voabularies for each columns in the dataset. 
Must be in an instance of List Default: None - tokenizer: The tokenizer used to preprocess word column in raw text data - Default: None data_select: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the three datasets (train, test, valid) are generated. Users @@ -156,8 +137,6 @@ def CoNLL2000Chunking(*args, **kwargs): vocabs: A list of voabularies for each columns in the dataset. Must be in an instance of List Default: None - tokenizer: The tokenizer used to preprocess word column in raw text data - Default: None data_select: a string or tuple for the returned datasets (Default: ('train', 'valid', 'test')) By default, all the three datasets (train, test, valid) are generated. Users From 12d4482c2f29edf98fa14ed35cb3ec750b52c5e4 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 12 Jun 2020 10:35:49 +0700 Subject: [PATCH 17/24] fix linting --- test/data/test_builtin_datasets.py | 1 + 1 file changed, 1 insertion(+) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 96582d71cc..3103794a66 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -258,6 +258,7 @@ def test_conll_sequence_tagging(self): word_vocab = train_dataset.get_vocab()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690]) + def test_squad1(self): from torchtext.experimental.datasets import SQuAD1 from torchtext.vocab import Vocab From 0aad1c4bd3420113595b0f1cc718c580bf9f7a83 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Mon, 15 Jun 2020 09:37:50 +0700 Subject: [PATCH 18/24] add cases where we don't have blank by the end of the file --- torchtext/experimental/datasets/raw/sequence_tagging.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/torchtext/experimental/datasets/raw/sequence_tagging.py b/torchtext/experimental/datasets/raw/sequence_tagging.py index 0e3655be43..b4576e7f81 100644 --- a/torchtext/experimental/datasets/raw/sequence_tagging.py +++ b/torchtext/experimental/datasets/raw/sequence_tagging.py @@ -26,6 +26,8 @@ def _create_data_from_iob(data_path, separator="\t"): if len(columns) < i + 1: columns.append([]) columns[i].append(column) + if len(columns) > 0: + yield columns def _construct_filepath(paths, file_suffix): From b662081151f55e4b4c1e763c693074060e2ebae0 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Mon, 15 Jun 2020 09:47:57 +0700 Subject: [PATCH 19/24] - add validation for data_select - add validation while getting current data in dataset class --- .../experimental/datasets/sequence_tagging.py | 36 ++++++++++++++----- 1 file changed, 27 insertions(+), 9 deletions(-) diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 5ca1b3a27d..38395facea 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -28,6 +28,11 @@ def _setup_datasets(dataset_name, root=".data", vocabs=None, data_select=("train", "valid", "test")): + if isinstance(data_select, str): + data_select = [data_select] + if not set(data_select).issubset(set(("train", "test"))): + raise TypeError("Given data selection {} is not supported!".format(data_select)) + train, val, test = DATASETS[dataset_name](root=root) raw_data = { "train": [line for line in train] if train else None, @@ -75,31 +80,44 @@ class SequenceTaggingDataset(torch.utils.data.Dataset): - UDPOS - CoNLL2000Chunking """ - def 
__init__(self, data, vocab, transforms): + def __init__(self, data, vocabs, transforms): """Initiate sequence tagging dataset. Arguments: data: a list of word and its respective tags. Example: [[word, POS, dep_parsing label, ...]] - vocab: Vocabulary object used for dataset. + vocabs: a list of vocabularies for its respective tags. + The number of vocabs must be the same as the number of columns + found in the data. transforms: a list of string transforms for words and tags. + The number of transforms must be the same as the number of columns + found in the data. """ super(SequenceTaggingDataset, self).__init__() self.data = data - self.vocab = vocab + self.vocabs = vocabs self.transforms = transforms + if len(self.data[0]) != len(self.vocabs): + raise ValueError("vocabs must hahve the same number of columns " + "as the data") + + if len(self.data[0]) != len(self.transforms): + raise ValueError("vocabs must hahve the same number of columns " + "as the data") + def __getitem__(self, i): - line = [] - for idx, transform in enumerate(self.transforms): - line.append(transform(self.data[i][idx])) - return line + curr_data = self.data[i] + if len(curr_data) != len(self.transforms): + raise ValueError("data must have the same number of columns " + "with transforms function") + return [self.transforms[idx](curr_data[idx]) for idx in range(self.transforms)] def __len__(self): return len(self.data) - def get_vocab(self): - return self.vocab + def get_vocabs(self): + return self.vocabs def UDPOS(*args, **kwargs): From 13864eac69191b234a930964fe364bb67dc4612a Mon Sep 17 00:00:00 2001 From: akurniawan Date: Mon, 15 Jun 2020 10:02:37 +0700 Subject: [PATCH 20/24] modify method name --- test/data/test_builtin_datasets.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index 4ffdfdbb88..bf727d5eca 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -205,13 +205,13 @@ def test_udpos_sequence_tagging(self): torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) # Assert vocabs - self.assertEqual(len(train_dataset.get_vocab()), 3) - self.assertEqual(len(train_dataset.get_vocab()[0]), 19674) - self.assertEqual(len(train_dataset.get_vocab()[1]), 19) - self.assertEqual(len(train_dataset.get_vocab()[2]), 52) + self.assertEqual(len(train_dataset.get_vocabs()), 3) + self.assertEqual(len(train_dataset.get_vocabs()[0]), 19674) + self.assertEqual(len(train_dataset.get_vocabs()[1]), 19) + self.assertEqual(len(train_dataset.get_vocabs()[2]), 52) # Assert token ids - word_vocab = train_dataset.get_vocab()[0] + word_vocab = train_dataset.get_vocabs()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] self.assertEqual(tokens_ids, [1206, 8, 69, 60, 157, 452]) @@ -249,13 +249,13 @@ def test_conll_sequence_tagging(self): torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long()) # Assert vocabs - self.assertEqual(len(train_dataset.get_vocab()), 3) - self.assertEqual(len(train_dataset.get_vocab()[0]), 19124) - self.assertEqual(len(train_dataset.get_vocab()[1]), 46) - self.assertEqual(len(train_dataset.get_vocab()[2]), 24) + self.assertEqual(len(train_dataset.get_vocabs()), 3) + self.assertEqual(len(train_dataset.get_vocabs()[0]), 19124) + self.assertEqual(len(train_dataset.get_vocabs()[1]), 46) + self.assertEqual(len(train_dataset.get_vocabs()[2]), 24) # Assert token ids - word_vocab = train_dataset.get_vocab()[0] + word_vocab = 
train_dataset.get_vocabs()[0] tokens_ids = [word_vocab[token] for token in 'Two of them were being run'.split()] self.assertEqual(tokens_ids, [970, 5, 135, 43, 214, 690]) From e3d4256427eeec4317b6e1e243d8ed82337bb231 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Mon, 15 Jun 2020 10:03:31 +0700 Subject: [PATCH 21/24] add "valid" to data_select option validation --- torchtext/experimental/datasets/sequence_tagging.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index 38395facea..baf7888e53 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -30,7 +30,7 @@ def _setup_datasets(dataset_name, data_select=("train", "valid", "test")): if isinstance(data_select, str): data_select = [data_select] - if not set(data_select).issubset(set(("train", "test"))): + if not set(data_select).issubset(set(("train", "valid", "test"))): raise TypeError("Given data selection {} is not supported!".format(data_select)) train, val, test = DATASETS[dataset_name](root=root) @@ -111,7 +111,7 @@ def __getitem__(self, i): if len(curr_data) != len(self.transforms): raise ValueError("data must have the same number of columns " "with transforms function") - return [self.transforms[idx](curr_data[idx]) for idx in range(self.transforms)] + return [self.transforms[idx](curr_data[idx]) for idx in range(len(self.transforms))] def __len__(self): return len(self.data) From 351ad46027db5e0da2e8a6e202da08cb8f34c9e1 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 17 Jun 2020 09:36:53 +0700 Subject: [PATCH 22/24] add todo for assert_allclose --- test/data/test_builtin_datasets.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index bf727d5eca..4ba6cb5a77 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -165,6 +165,8 @@ def test_udpos_sequence_tagging(self): self.assertEqual(len(train_dataset), 12543) self.assertEqual(len(valid_dataset), 2002) self.assertEqual(len(test_dataset), 2077) + # TODO: replace assert_allclose with self.assertEqual once + # https://github.com/pytorch/text/pull/822 has been landed assert_allclose(train_dataset[0][0][:10], torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) assert_allclose(train_dataset[0][1][:10], @@ -222,6 +224,8 @@ def test_conll_sequence_tagging(self): train_dataset, test_dataset = CoNLL2000Chunking() self.assertEqual(len(train_dataset), 8936) self.assertEqual(len(test_dataset), 2012) + # TODO: replace assert_allclose with self.assertEqual once + # https://github.com/pytorch/text/pull/822 has been landed assert_allclose(train_dataset[0][0][:10], torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long()) assert_allclose(train_dataset[0][1][:10], From 73ce74a81a25cd046a997cda898d2413a61d0329 Mon Sep 17 00:00:00 2001 From: akurniawan Date: Wed, 17 Jun 2020 09:39:02 +0700 Subject: [PATCH 23/24] remove duplicate validation for transforms function --- torchtext/experimental/datasets/sequence_tagging.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/torchtext/experimental/datasets/sequence_tagging.py b/torchtext/experimental/datasets/sequence_tagging.py index baf7888e53..8a6a97a55e 100644 --- a/torchtext/experimental/datasets/sequence_tagging.py +++ b/torchtext/experimental/datasets/sequence_tagging.py @@ -99,11 
+99,7 @@ def __init__(self, data, vocabs, transforms): self.transforms = transforms if len(self.data[0]) != len(self.vocabs): - raise ValueError("vocabs must hahve the same number of columns " - "as the data") - - if len(self.data[0]) != len(self.transforms): - raise ValueError("vocabs must hahve the same number of columns " + raise ValueError("vocabs must have the same number of columns " "as the data") def __getitem__(self, i): From e4ba11cb00d018e25da95dbcf9205e16e81ddb6d Mon Sep 17 00:00:00 2001 From: akurniawan Date: Fri, 19 Jun 2020 09:51:32 +0700 Subject: [PATCH 24/24] replace assert_allclose with self.assertEqual --- test/data/test_builtin_datasets.py | 130 ++++++++++++++--------------- 1 file changed, 63 insertions(+), 67 deletions(-) diff --git a/test/data/test_builtin_datasets.py b/test/data/test_builtin_datasets.py index b6d24cf077..1c120da16c 100644 --- a/test/data/test_builtin_datasets.py +++ b/test/data/test_builtin_datasets.py @@ -164,46 +164,44 @@ def test_udpos_sequence_tagging(self): self.assertEqual(len(train_dataset), 12543) self.assertEqual(len(valid_dataset), 2002) self.assertEqual(len(test_dataset), 2077) - # TODO: replace assert_allclose with self.assertEqual once - # https://github.com/pytorch/text/pull/822 has been landed - assert_allclose(train_dataset[0][0][:10], - torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) - assert_allclose(train_dataset[0][1][:10], - torch.tensor([8, 3, 8, 3, 9, 2, 4, 8, 8, 8]).long()) - assert_allclose(train_dataset[0][2][:10], - torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) - assert_allclose(train_dataset[-1][0][:10], - torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 17]).long()) - assert_allclose(train_dataset[-1][1][:10], - torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) - assert_allclose(train_dataset[-1][2][:10], - torch.tensor([6, 20, 8, 10, 8, 8, 24, 13, 8, 15]).long()) - - assert_allclose(valid_dataset[0][0][:10], - torch.tensor([746, 3, 10633, 656, 25, 1334, 45]).long()) - assert_allclose(valid_dataset[0][1][:10], - torch.tensor([6, 7, 8, 4, 7, 2, 3]).long()) - assert_allclose(valid_dataset[0][2][:10], - torch.tensor([3, 4, 5, 16, 4, 2, 27]).long()) - assert_allclose(valid_dataset[-1][0][:10], - torch.tensor([354, 4, 31, 17, 141, 421, 148, 6, 7, 78]).long()) - assert_allclose(valid_dataset[-1][1][:10], - torch.tensor([11, 3, 5, 4, 9, 2, 2, 12, 7, 11]).long()) - assert_allclose(valid_dataset[-1][2][:10], - torch.tensor([8, 12, 6, 15, 7, 2, 2, 13, 4, 8]).long()) - - assert_allclose(test_dataset[0][0][:10], - torch.tensor([210, 54, 3115, 0, 12229, 0, 33]).long()) - assert_allclose(test_dataset[0][1][:10], - torch.tensor([5, 15, 8, 4, 6, 8, 3]).long()) - assert_allclose(test_dataset[0][2][:10], - torch.tensor([30, 3, 5, 14, 3, 5, 9]).long()) - assert_allclose(test_dataset[-1][0][:10], - torch.tensor([116, 0, 6, 11, 412, 10, 0, 4, 0, 6]).long()) - assert_allclose(test_dataset[-1][1][:10], - torch.tensor([5, 4, 12, 10, 9, 15, 4, 3, 4, 12]).long()) - assert_allclose(test_dataset[-1][2][:10], - torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) + self.assertEqual(train_dataset[0][0][:10], + torch.tensor([262, 16, 5728, 45, 289, 701, 1160, 4436, 10660, 585]).long()) + self.assertEqual(train_dataset[0][1][:10], + torch.tensor([8, 3, 8, 3, 9, 2, 4, 8, 8, 8]).long()) + self.assertEqual(train_dataset[0][2][:10], + torch.tensor([5, 34, 5, 27, 7, 11, 14, 5, 5, 5]).long()) + self.assertEqual(train_dataset[-1][0][:10], + torch.tensor([9, 32, 169, 436, 59, 192, 30, 6, 117, 
17]).long()) + self.assertEqual(train_dataset[-1][1][:10], + torch.tensor([5, 10, 11, 4, 11, 11, 3, 12, 11, 4]).long()) + self.assertEqual(train_dataset[-1][2][:10], + torch.tensor([6, 20, 8, 10, 8, 8, 24, 13, 8, 15]).long()) + + self.assertEqual(valid_dataset[0][0][:10], + torch.tensor([746, 3, 10633, 656, 25, 1334, 45]).long()) + self.assertEqual(valid_dataset[0][1][:10], + torch.tensor([6, 7, 8, 4, 7, 2, 3]).long()) + self.assertEqual(valid_dataset[0][2][:10], + torch.tensor([3, 4, 5, 16, 4, 2, 27]).long()) + self.assertEqual(valid_dataset[-1][0][:10], + torch.tensor([354, 4, 31, 17, 141, 421, 148, 6, 7, 78]).long()) + self.assertEqual(valid_dataset[-1][1][:10], + torch.tensor([11, 3, 5, 4, 9, 2, 2, 12, 7, 11]).long()) + self.assertEqual(valid_dataset[-1][2][:10], + torch.tensor([8, 12, 6, 15, 7, 2, 2, 13, 4, 8]).long()) + + self.assertEqual(test_dataset[0][0][:10], + torch.tensor([210, 54, 3115, 0, 12229, 0, 33]).long()) + self.assertEqual(test_dataset[0][1][:10], + torch.tensor([5, 15, 8, 4, 6, 8, 3]).long()) + self.assertEqual(test_dataset[0][2][:10], + torch.tensor([30, 3, 5, 14, 3, 5, 9]).long()) + self.assertEqual(test_dataset[-1][0][:10], + torch.tensor([116, 0, 6, 11, 412, 10, 0, 4, 0, 6]).long()) + self.assertEqual(test_dataset[-1][1][:10], + torch.tensor([5, 4, 12, 10, 9, 15, 4, 3, 4, 12]).long()) + self.assertEqual(test_dataset[-1][2][:10], + torch.tensor([6, 16, 13, 16, 7, 3, 19, 12, 19, 13]).long()) # Assert vocabs self.assertEqual(len(train_dataset.get_vocabs()), 3) @@ -223,33 +221,31 @@ def test_conll_sequence_tagging(self): train_dataset, test_dataset = CoNLL2000Chunking() self.assertEqual(len(train_dataset), 8936) self.assertEqual(len(test_dataset), 2012) - # TODO: replace assert_allclose with self.assertEqual once - # https://github.com/pytorch/text/pull/822 has been landed - assert_allclose(train_dataset[0][0][:10], - torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long()) - assert_allclose(train_dataset[0][1][:10], - torch.tensor([2, 3, 5, 2, 17, 12, 16, 15, 13, 5]).long()) - assert_allclose(train_dataset[0][2][:10], - torch.tensor([3, 6, 3, 2, 5, 7, 7, 7, 7, 3]).long()) - assert_allclose(train_dataset[-1][0][:10], - torch.tensor([85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502]).long()) - assert_allclose(train_dataset[-1][1][:10], - torch.tensor([18, 17, 12, 19, 10, 6, 3, 3, 4, 4]).long()) - assert_allclose(train_dataset[-1][2][:10], - torch.tensor([3, 5, 7, 7, 3, 2, 6, 6, 3, 2]).long()) - - assert_allclose(test_dataset[0][0][:10], - torch.tensor([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7]).long()) - assert_allclose(test_dataset[0][1][:10], - torch.tensor([4, 4, 4, 23, 4, 2, 11, 18, 11, 5]).long()) - assert_allclose(test_dataset[0][2][:10], - torch.tensor([3, 2, 2, 3, 2, 2, 5, 3, 5, 3]).long()) - assert_allclose(test_dataset[-1][0][:10], - torch.tensor([51, 456, 560, 2, 11, 465, 2, 1413, 36, 60]).long()) - assert_allclose(test_dataset[-1][1][:10], - torch.tensor([3, 4, 4, 8, 3, 2, 8, 4, 17, 16]).long()) - assert_allclose(test_dataset[-1][2][:10], - torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long()) + self.assertEqual(train_dataset[0][0][:10], + torch.tensor([11556, 9, 3, 1775, 17, 1164, 177, 6, 212, 317]).long()) + self.assertEqual(train_dataset[0][1][:10], + torch.tensor([2, 3, 5, 2, 17, 12, 16, 15, 13, 5]).long()) + self.assertEqual(train_dataset[0][2][:10], + torch.tensor([3, 6, 3, 2, 5, 7, 7, 7, 7, 3]).long()) + self.assertEqual(train_dataset[-1][0][:10], + torch.tensor([85, 17, 59, 6473, 288, 115, 72, 5, 2294, 2502]).long()) + 
self.assertEqual(train_dataset[-1][1][:10], + torch.tensor([18, 17, 12, 19, 10, 6, 3, 3, 4, 4]).long()) + self.assertEqual(train_dataset[-1][2][:10], + torch.tensor([3, 5, 7, 7, 3, 2, 6, 6, 3, 2]).long()) + + self.assertEqual(test_dataset[0][0][:10], + torch.tensor([0, 294, 73, 10, 13582, 194, 18, 24, 2414, 7]).long()) + self.assertEqual(test_dataset[0][1][:10], + torch.tensor([4, 4, 4, 23, 4, 2, 11, 18, 11, 5]).long()) + self.assertEqual(test_dataset[0][2][:10], + torch.tensor([3, 2, 2, 3, 2, 2, 5, 3, 5, 3]).long()) + self.assertEqual(test_dataset[-1][0][:10], + torch.tensor([51, 456, 560, 2, 11, 465, 2, 1413, 36, 60]).long()) + self.assertEqual(test_dataset[-1][1][:10], + torch.tensor([3, 4, 4, 8, 3, 2, 8, 4, 17, 16]).long()) + self.assertEqual(test_dataset[-1][2][:10], + torch.tensor([6, 3, 2, 4, 6, 3, 4, 3, 5, 7]).long()) # Assert vocabs self.assertEqual(len(train_dataset.get_vocabs()), 3)
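For quick orientation, the tests above boil down to the following end-to-end flow. This is only an illustrative sketch of how the new datasets are meant to be used once the series lands; it assumes the experimental import path used in the test file and the get_vocabs() accessor introduced in PATCH 20, and the merged code should be treated as the source of truth.

    # Sketch only: sequence tagging datasets as exercised by the tests in this series.
    from torchtext.experimental.datasets import UDPOS, CoNLL2000Chunking

    # UDPOS provides train/valid/test splits; CoNLL2000Chunking has no validation split.
    train_dataset, valid_dataset, test_dataset = UDPOS()
    conll_train, conll_test = CoNLL2000Chunking()

    # Each item is a list of tensors, one per column (words, POS tags, chunk/dependency labels).
    words, pos_tags, other_tags = train_dataset[0]

    # One vocabulary per column; the word vocabulary maps tokens to ids.
    word_vocab = train_dataset.get_vocabs()[0]
    token_ids = [word_vocab[token] for token in 'Two of them were being run'.split()]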