1 | 1 | """ |
2 | | -Sequence-to-Sequence Modeling with nn.Transformer and TorchText |
| 2 | +Language Modeling with nn.Transformer and TorchText |
3 | 3 | =============================================================== |
4 | 4 |
5 | 5 | This is a tutorial on how to train a sequence-to-sequence model |
6 | 6 | that uses the |
7 | | -`nn.Transformer <https://pytorch.org/docs/master/nn.html?highlight=nn%20transformer#torch.nn.Transformer>`__ module. |
| 7 | +`nn.Transformer <https://pytorch.org/docs/stable/generated/torch.nn.Transformer.html>`__ module. |
8 | 8 |
9 | 9 | The PyTorch 1.2 release includes a standard transformer module based on the
10 | 10 | paper `Attention is All You |
11 | 11 | Need <https://arxiv.org/pdf/1706.03762.pdf>`__. The transformer model |
12 | 12 | has proven to be superior in quality for many sequence-to-sequence
13 | 13 | problems while being more parallelizable. The ``nn.Transformer`` module |
14 | 14 | relies entirely on an attention mechanism (another module recently |
15 | | -implemented as `nn.MultiheadAttention <https://pytorch.org/docs/master/nn.html?highlight=multiheadattention#torch.nn.MultiheadAttention>`__) to draw global dependencies |
| 15 | +implemented as `nn.MultiheadAttention <https://pytorch.org/docs/stable/generated/torch.nn.MultiheadAttention.html>`__) to draw global dependencies |
16 | 16 | between input and output. The ``nn.Transformer`` module is now highly |
17 | | -modularized such that a single component (like `nn.TransformerEncoder <https://pytorch.org/docs/master/nn.html?highlight=nn%20transformerencoder#torch.nn.TransformerEncoder>`__ |
| 17 | +modularized such that a single component (like `nn.TransformerEncoder <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoder.html>`__ |
18 | 18 | in this tutorial) can be easily adapted/composed. |
19 | 19 |
20 | 20 | .. image:: ../_static/img/transformer_architecture.jpg |
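As a minimal sketch of that attention primitive in isolation (the tensor sizes below are arbitrary, chosen only for illustration), ``nn.MultiheadAttention`` can be exercised on its own:

    import torch
    import torch.nn as nn

    # Self-attention over a toy batch: query, key and value are the same tensor.
    # Shapes are (seq_len, batch, embed_dim) since batch_first defaults to False.
    attn = nn.MultiheadAttention(embed_dim=16, num_heads=2)
    x = torch.rand(5, 3, 16)
    out, weights = attn(x, x, x)
    print(out.shape)      # torch.Size([5, 3, 16])
    print(weights.shape)  # torch.Size([3, 5, 5]) -- weights averaged over heads

Every position attends to every other position here; the language modeling setup below restricts that with a mask.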
35 | 35 | # layer first, followed by a positional encoding layer to account for the order |
36 | 36 | # of the words (see the next paragraph for more details). The
37 | 37 | # ``nn.TransformerEncoder`` consists of multiple layers of |
38 | | -# `nn.TransformerEncoderLayer <https://pytorch.org/docs/master/nn.html?highlight=transformerencoderlayer#torch.nn.TransformerEncoderLayer>`__. Along with the input sequence, a square |
| 38 | +# `nn.TransformerEncoderLayer <https://pytorch.org/docs/stable/generated/torch.nn.TransformerEncoderLayer.html>`__. Along with the input sequence, a square |
39 | 39 | # attention mask is required because the self-attention layers in |
40 | 40 | # ``nn.TransformerEncoder`` are only allowed to attend to the earlier positions in
41 | 41 | # the sequence. For the language modeling task, any tokens on the future |
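Such a square mask can be built with plain tensor operations: fill a square matrix with ``-inf`` above the main diagonal and ``0.0`` on and below it, so that when the mask is added to the attention scores, position ``i`` can only attend to positions ``<= i``. A minimal sketch (the helper name here is made up for illustration):

    import torch

    def square_subsequent_mask(sz):
        # -inf above the main diagonal, 0.0 on and below it; added to the
        # attention scores, it blocks attention to future positions.
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    print(square_subsequent_mask(4))
    # tensor([[0., -inf, -inf, -inf],
    #         [0., 0., -inf, -inf],
    #         [0., 0., 0., -inf],
    #         [0., 0., 0., 0.]])

A mask of this shape is what ``nn.TransformerEncoder`` accepts through the ``mask`` argument of its forward call.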
@@ -144,23 +144,18 @@ def forward(self, x): |
144 | 144 | # efficient batch processing. |
145 | 145 | # |
146 | 146 |
147 | | -import io |
148 | 147 | import torch |
149 | 148 | from torchtext.datasets import WikiText2 |
150 | 149 | from torchtext.data.utils import get_tokenizer |
151 | | -from collections import Counter |
152 | | -from torchtext.vocab import Vocab |
| 150 | +from torchtext.vocab import build_vocab_from_iterator |
153 | 151 |
154 | 152 | train_iter = WikiText2(split='train') |
155 | 153 | tokenizer = get_tokenizer('basic_english') |
156 | | -counter = Counter() |
157 | | -for line in train_iter: |
158 | | - counter.update(tokenizer(line)) |
159 | | -vocab = Vocab(counter) |
| 154 | +vocab = build_vocab_from_iterator(map(tokenizer, train_iter), specials=["<unk>"]) |
| 155 | +vocab.set_default_index(vocab["<unk>"]) |
160 | 156 |
161 | 157 | def data_process(raw_text_iter): |
162 | | - data = [torch.tensor([vocab[token] for token in tokenizer(item)], |
163 | | - dtype=torch.long) for item in raw_text_iter] |
| 158 | + data = [torch.tensor(vocab(tokenizer(item)), dtype=torch.long) for item in raw_text_iter] |
164 | 159 | return torch.cat(tuple(filter(lambda t: t.numel() > 0, data))) |
165 | 160 |
166 | 161 | train_iter, val_iter, test_iter = WikiText2() |
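Continuing from the code above, the rebuilt ``vocab`` can be called directly on a list of tokens, and any out-of-vocabulary token falls back to the ``<unk>`` index installed by ``set_default_index``; ``data_process`` then flattens an iterator of raw lines into a single 1-D tensor of ids, dropping empty lines via the ``numel()`` filter. A quick sketch (the sample strings are arbitrary):

    tokens = tokenizer('the history of the transformer')
    print(vocab(tokens))    # one integer id per token
    print(vocab['<unk>'])   # the id that any unknown token maps to

    # Empty lines tokenize to an empty tensor and are filtered out before torch.cat.
    print(data_process(['a first line', '', 'a second line']))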
@@ -225,7 +220,7 @@ def get_batch(source, i): |
225 | 220 | # equal to the length of the vocab object. |
226 | 221 | # |
227 | 222 |
228 | | -ntokens = len(vocab.stoi) # the size of vocabulary |
| 223 | +ntokens = len(vocab) # the size of the vocabulary
229 | 224 | emsize = 200 # embedding dimension |
230 | 225 | nhid = 200 # the dimension of the feedforward network model in nn.TransformerEncoder |
231 | 226 | nlayers = 2 # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder |
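As a rough sketch of how these sizes fit together (``nhead`` is assumed to be 2 here purely for illustration; it only needs to divide ``emsize`` evenly), the embedding, encoder stack, and output projection that a model built on ``nn.TransformerEncoder`` composes look like this:

    import torch.nn as nn

    nhead = 2  # assumed value for illustration; must divide emsize evenly
    embedding = nn.Embedding(ntokens, emsize)        # token ids -> emsize-dim vectors
    encoder_layer = nn.TransformerEncoderLayer(d_model=emsize, nhead=nhead,
                                               dim_feedforward=nhid)
    transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=nlayers)
    decoder = nn.Linear(emsize, ntokens)             # back to vocabulary-sized logits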