This repository was archived by the owner on Sep 10, 2025. It is now read-only.
Merged
Commits
73 commits
a603d7b
add MHA building blocks in torchtext
Apr 2, 2020
5d06447
add docs
Apr 2, 2020
e9e18cb
combine forward function with functional
Apr 2, 2020
36c876a
add models to init
Apr 2, 2020
f7f2816
Merge branch 'master' into mha_blocks
Apr 2, 2020
bddc782
minor revisions
Apr 2, 2020
e665f38
minor change
Apr 2, 2020
4a44337
revision
Apr 2, 2020
fa36f85
Add unit test
Apr 2, 2020
45fed34
update docs
Apr 2, 2020
a6a2d94
flake8
Apr 2, 2020
b741e1f
add MultiheadAttentionContainer
Apr 8, 2020
86a1641
Merge branch 'master' into mha_blocks
Apr 8, 2020
ba8cd3a
update models init file
Apr 8, 2020
1c35a05
update docs of container
Apr 8, 2020
5415568
update MHA test
Apr 8, 2020
2055c16
remove in/out projection
Apr 13, 2020
9adc723
Switch MultiheadAttentionContainer to accept ScaledDotProduct, Multih…
Apr 15, 2020
9e4e0b7
Merge branch 'master' into mha_blocks
Apr 15, 2020
f94506a
add JIT support for MHA blocks
Apr 15, 2020
f3ed887
standardize attn_mask
Apr 15, 2020
4a38802
update docs
Apr 15, 2020
a5bfdee
fix a bug in torchscript test
Apr 15, 2020
e81c4b3
add attn_mask in test_multiheadattention and test_torchscript_multihe…
Apr 16, 2020
66b71ac
add partial broadcast support for ScaledDotProduct. Only allow the ba…
Apr 16, 2020
da1bc7a
add more broadcast tests for scaled dot product model
Apr 17, 2020
f681d2d
Merge branch 'master' into mha_blocks
Apr 23, 2020
accceeb
add support for incremental decoding
Apr 23, 2020
7bd3beb
remove nheads from ScaledDotProduct
Apr 23, 2020
14da915
minor fix in jit test
Apr 23, 2020
032d749
adjust attn_mask
Apr 23, 2020
97bda4e
Merge branch 'master' into mha_blocks
Apr 23, 2020
5c1198c
fix jit annotation
Apr 23, 2020
c4ccac7
minor
Apr 23, 2020
295ab13
refine optional tensor in torchscript
Apr 23, 2020
9679022
minor fix in mha test
Apr 23, 2020
bc8a75f
remove a few assert statements
Apr 24, 2020
3a7d70d
a few changes for torchscript in python 3
Apr 24, 2020
8008798
switch the name from models to modules
Apr 24, 2020
6b20f4a
Merge branch 'master' into mha_blocks
Apr 24, 2020
e12e131
minor fix
Apr 24, 2020
4d3be66
Merge branch 'master' into mha_blocks
Apr 27, 2020
659db7a
move reshape to MHA container
Apr 27, 2020
a3a21e7
update doc
Apr 27, 2020
f7e75d1
minor
Apr 27, 2020
11e3027
assertRaises tests in broadcast
Apr 27, 2020
5a709a5
fix typo
Apr 28, 2020
a90c826
minor fix
Apr 28, 2020
0409636
add benchmark case
Apr 28, 2020
6c9a7a3
remove bias from test
Apr 29, 2020
45d28b5
update benchmark case
Apr 29, 2020
4f3b458
add InProjContainer
Apr 29, 2020
da4b302
update benchmark
Apr 29, 2020
4aeaf5e
minor test
Apr 29, 2020
517b921
minor fix
Apr 29, 2020
87801e2
flake8
Apr 29, 2020
494c1e9
Merge branch 'master' into mha_blocks
May 1, 2020
3380012
minor docs update
May 4, 2020
156ee4d
Merge remote-tracking branch 'upstream/master' into mha_blocks
May 4, 2020
5c51e2c
Merge branch 'master' into mha_blocks
May 4, 2020
f02073f
add self-attention in the benchmark
May 5, 2020
9a0a789
update benchmark test with more cases
May 7, 2020
6e9adf4
Merge remote-tracking branch 'upstream/master' into mha_blocks
May 13, 2020
941d184
Merge branch 'master' into mha_blocks
May 15, 2020
c0c3152
Merge branch 'master' into mha_blocks
May 15, 2020
9f2491a
update attn_mask
May 15, 2020
8771e3f
add generate_square_subsequent_mask
May 15, 2020
097f690
Merge branch 'master' into mha_blocks
May 18, 2020
c958652
Merge branch 'master' into mha_blocks
May 19, 2020
8b50742
update docs in MHA container
May 21, 2020
f799ef4
Merge branch 'master' into mha_blocks
Jun 2, 2020
496c43b
Merge branch 'master' into mha_blocks
Jun 5, 2020
7078c93
add InProjContainer in docs
Jun 5, 2020
3 changes: 2 additions & 1 deletion .flake8
@@ -1,4 +1,5 @@
[flake8]
ignore = E402,E722,W503,W504,F821
# E501 is not flexible enough, we're using B950 instead. Consistent with pytorch
ignore = E402,E722,W503,W504,F821,E501
max-line-length = 120
exclude = docs/source,third_party
103 changes: 103 additions & 0 deletions benchmark/mha_block.py
@@ -0,0 +1,103 @@
import torch
from torchtext.modules import InProjContainer, MultiheadAttentionContainer, ScaledDotProduct
from torch.nn.functional import multi_head_attention_forward as mha_forward
import time


def benchmark_mha_block():

def _run_benchmark(embed_dim, nhead, bsz, device, tgt_len, src_len=None):
# Build torchtext MultiheadAttention module
in_proj_container = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
torch.nn.Linear(embed_dim, embed_dim),
torch.nn.Linear(embed_dim, embed_dim))
MHA = MultiheadAttentionContainer(nhead, in_proj_container,
ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim)).to(device)

query = torch.rand((tgt_len, bsz, embed_dim)).to(device)
if src_len is None:
key = value = query
src_len = tgt_len
else:
key = value = torch.rand((src_len, bsz, embed_dim)).to(device)
attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool).to(device)
attn_mask = torch.stack([attn_mask_2D] * (bsz * nhead))
bias_k = bias_v = torch.rand((1, 1, embed_dim)).to(device)
print("starting torchtext.modules.MultiheadAttentionContainer")
if device == torch.device("cuda"):
torch.cuda.synchronize()
t0 = time.monotonic()
for _ in range(100):
mha_output, attn_weights = MHA(query, key, value,
attn_mask=attn_mask,
bias_k=bias_k.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1),
bias_v=bias_v.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1))
if device == torch.device("cuda"):
torch.cuda.synchronize()
print(time.monotonic() - t0)
Member:
If you are benchmarking with CUDA, you need to add a torch.cuda.synchronize() before and after measuring the time, otherwise the timings won't be correct.

Contributor Author:
Thanks. Will add them there.

Contributor:
The reason for this is that calls into CUDA versions of operations are launched asynchronously. Only when you print a Tensor or move it to the CPU can you be sure all operations have finished. Using synchronize here makes sure all the work has indeed finished and you are timing it correctly. Also see torch.cuda.

Contributor:
@zhangguanheng66 Could you share with us how your implementation performs compared to the PyTorch one after you have fixed the timing? Thanks.
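
For reference, the pattern being requested looks roughly like this (a minimal sketch; the helper name time_cuda_op is only for illustration and assumes a CUDA device is available):

import time
import torch

def time_cuda_op(fn, iters=100):
    # CUDA kernels are launched asynchronously, so synchronize before starting
    # the clock and again before reading it; otherwise the measurement covers
    # only the kernel launches, not the actual work.
    torch.cuda.synchronize()
    t0 = time.monotonic()
    for _ in range(iters):
        fn()
    torch.cuda.synchronize()
    return time.monotonic() - t0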


# Use torch.nn.functional.multi_head_attention_forward
torch_attn_mask = torch.zeros((tgt_len, src_len)).to(device).masked_fill_(attn_mask_2D, float('-inf'))
print("starting torch.nn.functional.multi_head_attention_forward")
in_proj_weight = torch.cat([MHA.in_proj_container.query_proj.weight,
MHA.in_proj_container.key_proj.weight,
MHA.in_proj_container.value_proj.weight])
if device == torch.device("cuda"):
torch.cuda.synchronize()
t0 = time.monotonic()
for _ in range(100):
torch_mha_output, torch_mha_weights = mha_forward(query, key, value,
embed_dim, nhead,
in_proj_weight, None,
bias_k, bias_v,
False, 0.0,
MHA.out_proj.weight,
MHA.out_proj.bias,
attn_mask=torch_attn_mask)
if device == torch.device("cuda"):
torch.cuda.synchronize()
print(time.monotonic() - t0)
Member:
Same comment here.


# GPU test
device = torch.device("cuda")
for embed_dim in [64, 768]:
for nhead in [2, 16]:
for seq_len in [10, 128, 1000]:
for bsz in [2, 72]:
if seq_len == 1000 and bsz == 72:
continue
print("*" * 80)
print("test case GPU with embed_dim, nhead, seq_len, bsz:",
embed_dim, nhead, seq_len, bsz)
_run_benchmark(embed_dim, nhead, bsz, device, seq_len, seq_len)

# GPU test for self-attention
device = torch.device("cuda")
for embed_dim in [64, 256]:
for nhead in [2, 16]:
for seq_len in [10, 128, 1000]:
for bsz in [2, 72]:
if seq_len == 1000 and bsz == 72:
continue
print("*" * 80)
print("self-attention test case GPU with embed_dim, nhead, seq_len, bsz:",
embed_dim, nhead, seq_len, bsz)
_run_benchmark(embed_dim, nhead, bsz, device, seq_len, None)

# CPU test for self-attention
device = torch.device("cpu")
for embed_dim in [64, 768]:
for nhead in [2, 16]:
for seq_len in [10, 128, 1000]:
for bsz in [2, 72]:
if seq_len == 1000 and bsz == 72:
continue
print("*" * 80)
print("test case CPU with embed_dim, nhead, seq_len, bsz:",
embed_dim, nhead, seq_len, bsz)
_run_benchmark(embed_dim, nhead, bsz, device, seq_len, None)


if __name__ == "__main__":
benchmark_mha_block()
23 changes: 23 additions & 0 deletions docs/source/modules.rst
@@ -0,0 +1,23 @@
.. role:: hidden
:class: hidden-section

torchtext.models.multiheadattention
==================================

.. automodule:: torchtext.models.multiheadattention
.. currentmodule:: torchtext.models.multiheadattention

:hidden:`MultiheadAttentionContainer`
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: MultiheadAttentionContainer

:hidden:`InProjContainer`
~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: InProjContainer

:hidden:`ScaledDotProduct`
~~~~~~~~~~~~~~~~~~~~~~~~~~

.. autofunction:: ScaledDotProduct
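
For readers skimming the docs, a minimal usage sketch of the new blocks, mirroring the shapes and calls used in the tests below (the sizes are illustrative only):

import torch
from torchtext.modules import InProjContainer, MultiheadAttentionContainer, ScaledDotProduct

embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
in_proj = InProjContainer(torch.nn.Linear(embed_dim, embed_dim),
                          torch.nn.Linear(embed_dim, embed_dim),
                          torch.nn.Linear(embed_dim, embed_dim))
mha = MultiheadAttentionContainer(nhead, in_proj,
                                  ScaledDotProduct(),
                                  torch.nn.Linear(embed_dim, embed_dim))

query = torch.rand(tgt_len, bsz, embed_dim)
key = value = torch.rand(src_len, bsz, embed_dim)
# Boolean mask, stacked to one slice per (batch * head), as in the tests.
attn_mask = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool)
attn_mask = torch.stack([attn_mask] * (bsz * nhead))

attn_output, attn_weights = mha(query, key, value, attn_mask=attn_mask)
# attn_output: (tgt_len, bsz, embed_dim); attn_weights: (bsz * nhead, tgt_len, src_len)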
28 changes: 28 additions & 0 deletions test/data/test_jit.py
@@ -0,0 +1,28 @@
import torch
from torchtext.modules import InProjContainer, MultiheadAttentionContainer, ScaledDotProduct
from torch.testing import assert_allclose
from ..common.torchtext_test_case import TorchtextTestCase


class TestJIT(TorchtextTestCase):

def test_torchscript_multiheadattention(self):
embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
# Build torchtext MultiheadAttention models
in_proj_container = InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False))

MHA = MultiheadAttentionContainer(nhead, in_proj_container,
ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False))
query = torch.rand((tgt_len, bsz, embed_dim))
key = value = torch.rand((src_len, bsz, embed_dim))
attn_mask = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool)
attn_mask = torch.stack([attn_mask] * (bsz * nhead))
mha_output, attn_weights = MHA(query, key, value, attn_mask=attn_mask)

ts_MHA = torch.jit.script(MHA)
ts_mha_output, ts_attn_weights = ts_MHA(query, key, value, attn_mask=attn_mask)
assert_allclose(mha_output, ts_mha_output)
assert_allclose(attn_weights, ts_attn_weights)
123 changes: 123 additions & 0 deletions test/data/test_modules.py
@@ -0,0 +1,123 @@
import torch
from torchtext.modules import InProjContainer, MultiheadAttentionContainer, ScaledDotProduct
from torch.nn.functional import multi_head_attention_forward as mha_forward
from torch.testing import assert_allclose
from ..common.torchtext_test_case import TorchtextTestCase


class TestModels(TorchtextTestCase):

def test_multiheadattention(self):
embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
# Build torchtext MultiheadAttention module
in_proj = InProjContainer(torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False),
torch.nn.Linear(embed_dim, embed_dim, bias=False))

MHA = MultiheadAttentionContainer(nhead, in_proj,
ScaledDotProduct(),
torch.nn.Linear(embed_dim, embed_dim, bias=False))

query = torch.rand((tgt_len, bsz, embed_dim))
key = value = torch.rand((src_len, bsz, embed_dim))
attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool)
bias_k = bias_v = torch.rand((1, 1, embed_dim))
mha_output, attn_weights = MHA(query, key, value,
attn_mask=torch.stack([attn_mask_2D] * (bsz * nhead)),
bias_k=bias_k.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1),
bias_v=bias_v.repeat(1, bsz, 1).reshape(1, bsz * nhead, -1))

# Use torch.nn.functional.multi_head_attention_forward
torch_attn_mask = torch.zeros((tgt_len, src_len)).masked_fill_(attn_mask_2D, float('-inf'))
in_proj_weight = torch.cat([MHA.in_proj_container.query_proj.weight,
MHA.in_proj_container.key_proj.weight,
MHA.in_proj_container.value_proj.weight])
torch_mha_output, torch_mha_weights = mha_forward(query, key, value,
embed_dim, nhead,
in_proj_weight, None,
bias_k, bias_v,
False, 0.0,
MHA.out_proj.weight, None,
attn_mask=torch_attn_mask)

assert_allclose(mha_output, torch_mha_output)
# With bias_k and bias_v, src_len is increased by 1
attn_weights = attn_weights.view(bsz, nhead, tgt_len, src_len + 1).sum(dim=1) / nhead
assert_allclose(attn_weights, torch_mha_weights)

def test_broadcast_scaled_dot_product(self):
Contributor:
I'd add some self.assertRaises calls to exercise some explicit cases where we expect broadcasting to definitely fail.

embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
SDP = ScaledDotProduct()
query = torch.rand((tgt_len, 1, embed_dim))
key = value = torch.rand((src_len, 1, embed_dim))
attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool)

sdp_attn_output_full, sdp_attn_weights_full = SDP(query.expand(tgt_len, bsz * nhead, embed_dim),
key.expand(src_len, bsz * nhead, embed_dim),
value.expand(src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))

# query has a batch size of 1 while key/value have a batch size of bsz * nhead
sdp_attn_output, sdp_attn_weights = SDP(query, key.expand(src_len, bsz * nhead, embed_dim),
value.expand(src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
assert_allclose(sdp_attn_output, sdp_attn_output_full)
assert_allclose(sdp_attn_weights, sdp_attn_weights_full)

# key/value have a batch size of 1 while query has a batch size of bsz * nhead
sdp_attn_output, sdp_attn_weights = SDP(query.expand(tgt_len, bsz * nhead, embed_dim),
key, value,
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
assert_allclose(sdp_attn_output, sdp_attn_output_full)
assert_allclose(sdp_attn_weights, sdp_attn_weights_full)

# key/value have a size of (3, 3, src_len, bsz * nhead, embed_dim)
# while query has a size of (tgt_len, 1, embed_dim)
sdp_attn_output, sdp_attn_weights = SDP(query.expand(tgt_len, 1, embed_dim),
key.expand(3, 3, src_len, bsz * nhead, embed_dim),
value.expand(3, 3, src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
assert list(sdp_attn_output.size()) == [3, 3, tgt_len, bsz * nhead, embed_dim]
assert list(sdp_attn_weights.size()) == [3, 3, bsz * nhead, tgt_len, embed_dim]
assert_allclose(sdp_attn_output[2][2], sdp_attn_output_full)
assert_allclose(sdp_attn_weights[2][2], sdp_attn_weights_full)
# query's dim -2 is equal to neither key/value's dim -2 nor 1
with self.assertRaises(RuntimeError):
SDP(query.expand(tgt_len, 2, embed_dim), key.expand(3, 3, src_len, bsz * nhead, embed_dim),
value.expand(3, 3, src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))

# key/value have a size of (src_len, 1, embed_dim)
# while query has a size of (1, 2, 3, tgt_len, bsz * nhead, embed_dim)
sdp_attn_output, sdp_attn_weights = SDP(query.expand(1, 2, 3, tgt_len, bsz * nhead, embed_dim),
key.expand(src_len, 1, embed_dim),
value.expand(src_len, 1, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
assert list(sdp_attn_output.size()) == [1, 2, 3, tgt_len, bsz * nhead, embed_dim]
assert list(sdp_attn_weights.size()) == [1, 2, 3, bsz * nhead, tgt_len, embed_dim]
assert_allclose(sdp_attn_output[0][1][2], sdp_attn_output_full)
assert_allclose(sdp_attn_weights[0][1][2], sdp_attn_weights_full)
# key dim -2 is not equal to value dim -2
with self.assertRaisesRegex(AssertionError, "Shape of key, value must match"):
SDP(query.expand(1, 2, 3, tgt_len, bsz * nhead, embed_dim), key.expand(src_len, 2, embed_dim),
value.expand(src_len, 1, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
# key/value's dim -2 is equal to neither query's dim -2 nor 1
with self.assertRaises(RuntimeError):
SDP(query.expand(1, 2, 3, tgt_len, bsz * nhead, embed_dim), key.expand(src_len, 2, embed_dim),
value.expand(src_len, 2, embed_dim),
attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))

# attn_mask has a size of (1, tgt_len, src_len)
# 2D tensor is not supported for attn_mask
sdp_attn_output, sdp_attn_weights = SDP(query.expand(tgt_len, bsz * nhead, embed_dim),
key.expand(src_len, bsz * nhead, embed_dim),
value.expand(src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(1, tgt_len, src_len))
assert_allclose(sdp_attn_output, sdp_attn_output_full)
assert_allclose(sdp_attn_weights, sdp_attn_weights_full)
# attn_mask's dim -3 is equal to neither the batch size nor 1
with self.assertRaisesRegex(RuntimeError, "The size of the attn_mask is not correct."):
SDP(query.expand(tgt_len, bsz * nhead, embed_dim), key.expand(src_len, bsz * nhead, embed_dim),
value.expand(src_len, bsz * nhead, embed_dim),
attn_mask=attn_mask_2D.expand(2, tgt_len, src_len))
2 changes: 2 additions & 0 deletions torchtext/__init__.py
@@ -1,4 +1,5 @@
from . import data
from . import modules
from . import datasets
from . import utils
from . import vocab
@@ -11,6 +12,7 @@
pass

__all__ = ['data',
'modules',
Contributor:
Why modules and not the torch.nn path convention?

'datasets',
'utils',
'vocab',
6 changes: 6 additions & 0 deletions torchtext/modules/__init__.py
@@ -0,0 +1,6 @@
from .multiheadattention import InProjContainer, \
MultiheadAttentionContainer, ScaledDotProduct

__all__ = ['InProjContainer',
'MultiheadAttentionContainer',
'ScaledDotProduct']