This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit 2b9b68c

Author: Guanheng Zhang (committed)
add partial broadcast support for ScaledDotProduct. Only allow the batch dim of either query or key/value to be 1
1 parent: e81c4b3 · commit: 2b9b68c
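
For context, a minimal usage sketch of the behavior this commit enables: either query or key/value may keep a batch dimension of 1 and be broadcast against the other. The ScaledDotProduct import path and constructor signature below are assumed from the files changed in this commit, not confirmed against a released API.

import torch
from torchtext.models.multiheadattention import ScaledDotProduct  # path assumed from this commit

nhead, embed_dim, tgt_len, src_len, bsz = 5, 10, 6, 10, 64
SDP = ScaledDotProduct(nhead)

# query keeps a batch dim of 1; key/value carry the full bsz * nhead batch dim
query = torch.rand((tgt_len, 1, embed_dim))
key = value = torch.rand((src_len, bsz * nhead, embed_dim))

attn_output, attn_weights = SDP(query, key, value)
print(attn_output.shape)    # expected: torch.Size([6, 320, 10]), i.e. (tgt_len, bsz * nhead, embed_dim)
print(attn_weights.shape)   # expected: torch.Size([320, 6, 10]), i.e. (bsz * nhead, tgt_len, src_len)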

File tree

2 files changed: +40 -9 lines changed


test/data/test_models.py

Lines changed: 26 additions & 0 deletions
@@ -39,3 +39,29 @@ def test_multiheadattention(self):
         assert_allclose(mha_output, torch_mha_output)
         attn_weights = attn_weights.view(bsz, nhead, tgt_len, src_len).sum(dim=1) / nhead
         assert_allclose(attn_weights, torch_mha_weights)
+
+    def test_broadcast_scaled_dot_product(self):
+        embed_dim, nhead, tgt_len, src_len, bsz = 10, 5, 6, 10, 64
+        SDP = ScaledDotProduct(nhead)
+        query = torch.rand((tgt_len, 1, embed_dim))
+        key = value = torch.rand((src_len, 1, embed_dim))
+        attn_mask_2D = torch.randint(0, 2, (tgt_len, src_len)).to(torch.bool)
+
+        sdp_attn_output_full, sdp_attn_weights_full = SDP(query.expand(tgt_len, bsz * nhead, embed_dim),
+                                                          key.expand(src_len, bsz * nhead, embed_dim),
+                                                          value.expand(src_len, bsz * nhead, embed_dim),
+                                                          attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
+
+        # query has a batch size of 1 while key/value have a batch size of bsz * nhead
+        sdp_attn_output, sdp_attn_weights = SDP(query, key.expand(src_len, bsz * nhead, embed_dim),
+                                                value.expand(src_len, bsz * nhead, embed_dim),
+                                                attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
+        assert_allclose(sdp_attn_output, sdp_attn_output_full)
+        assert_allclose(sdp_attn_weights, sdp_attn_weights_full)
+
+        # key/value have a batch size of 1 while query has a batch size of bsz * nhead
+        sdp_attn_output, sdp_attn_weights = SDP(query.expand(tgt_len, bsz * nhead, embed_dim),
+                                                key, value,
+                                                attn_mask=attn_mask_2D.expand(bsz * nhead, tgt_len, src_len))
+        assert_allclose(sdp_attn_output, sdp_attn_output_full)
+        assert_allclose(sdp_attn_weights, sdp_attn_weights_full)
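
A note on why the broadcast calls above should match the fully expanded call: Tensor.expand only creates a view over the size-1 dimension without copying data, so both variants see exactly the same values. A small standalone illustration (independent of this repo):

import torch

x = torch.rand(6, 1, 10)                 # batch dim of 1
y = x.expand(6, 320, 10)                 # a view, not a copy: same underlying storage
print(y.shape)                           # torch.Size([6, 320, 10])
print(x.data_ptr() == y.data_ptr())      # True -- expand allocates no new memory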

torchtext/models/multiheadattention.py

Lines changed: 14 additions & 9 deletions
@@ -185,19 +185,24 @@ def forward(self, query, key, value, attn_mask=None):
         where L is the target length, S is the source length, H is the number
         of attention heads, N is the batch size, and E is the embedding dimension.
         """
-        tgt_len, batch_heads, head_dim = query.size()
-        assert query.size(1) == key.size(1) == value.size(1), "Dimension 0 of query, key, value must be equal."
-        assert batch_heads % self.num_heads == 0, "Dimension 0 of query, key, value must be divisible by num_heads"
+        tgt_len, head_dim = query.size(-3), query.size(-1)
+        assert query.size(-1) == key.size(-1) == value.size(-1), "The feature dim of query, key, value must be equal."
         assert key.size() == value.size(), "Shape of key, value must match"
-        assert query.size(-1) == key.size(-1), "The head dimension of query must be equal to that of key"
-        src_len = key.size(0)
+        src_len = key.size(-3)
+        batch_heads = max(query.size(-2), key.size(-2))

         # Scale query
-        query, key, value = query.transpose(0, 1), key.transpose(0, 1), value.transpose(0, 1)
+        query, key, value = query.transpose(-2, -3), key.transpose(-2, -3), value.transpose(-2, -3)
         query = query * (float(head_dim) ** -0.5)
         if attn_mask is not None:
-            if list(attn_mask.size()) != [batch_heads, tgt_len, src_len]:
-                raise RuntimeError('The size of the 3D attn_mask is not correct.')
+            if attn_mask.dim() != 3:
+                raise RuntimeError('attn_mask must be a 3D tensor.')
+            print(attn_mask.size(-1), src_len)
+            print(attn_mask.size(-2), tgt_len)
+            print(attn_mask.size(-3), batch_heads)
+            if (attn_mask.size(-1) != src_len) or (attn_mask.size(-2) != tgt_len) or \
+                    (attn_mask.size(-3) != 1 and attn_mask.size(-3) != batch_heads):
+                raise RuntimeError('The size of the attn_mask is not correct.')
             if attn_mask.dtype != torch.bool:
                 raise RuntimeError('Only bool tensor is supported for attn_mask')

@@ -211,4 +216,4 @@ def forward(self, query, key, value, attn_mask=None):
         attn_output_weights = torch.nn.functional.softmax(attn_output_weights, dim=-1)
         attn_output_weights = torch.nn.functional.dropout(attn_output_weights, p=self.dropout, training=self.training)
         attn_output = torch.matmul(attn_output_weights, value)
-        return attn_output.transpose(0, 1), attn_output_weights
+        return attn_output.transpose(-2, -3), attn_output_weights
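
The broadcasting itself happens in the matmul-based score computation (only partly visible in this hunk): after transpose(-2, -3) the tensors are batch-first, and torch.matmul broadcasts a batch dimension of 1 against batch_heads, which is presumably why batch_heads is taken as max(query.size(-2), key.size(-2)). A short sketch of that broadcasting behavior, with shapes chosen to match the test above:

import torch

q = torch.rand(1, 6, 4)                        # (batch=1, tgt_len, head_dim)
k = torch.rand(320, 10, 4)                     # (batch=320, src_len, head_dim)
scores = torch.matmul(q, k.transpose(-2, -1))  # the size-1 batch dim broadcasts
print(scores.shape)                            # torch.Size([320, 6, 10])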
