This repository was archived by the owner on Sep 10, 2025. It is now read-only.

Commit c8c6dbb (1 parent: 448e615)

Update base for Update on "computing attention scores using relative attention bias"

WIP PR to workshop implementation: #1812 [ghstack-poisoned]

File tree: 1 file changed, +14 -24 lines

torchtext/prototype/t5/modules.py (14 additions, 24 deletions)
@@ -9,8 +9,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-# Original code is taken from
-# https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
+# Parts of code are originally from
+# https://github.com/huggingface/transformers/blob/8581a798c0a48fca07b29ce2ca2ef55adcae8c7e/src/transformers/models/t5/modeling_t5.py
 # */
 
 import math
@@ -28,32 +28,22 @@ def __init__(
         is_decoder=False,
         dropout=0.0,
         bias=False,
-        add_bias_kv=False,
-        add_zero_attn=False,
         kdim=None,
         vdim=None,
-        batch_first=False,
         device=None,
         dtype=None,
     ) -> None:
         r"""
         Args:
-            embed_dim: total dimension of the model.
-            num_heads: parallel attention heads.
-            is_decoder: whether or not multihead attention is being performed on a decoder layer. Default: ``False``
-            dropout: probability of an element to be zeroed. Default: 0.0
-            bias: If specified, adds bias to input / output projection layers. Default: ``False``.
-            add_bias_kv: If specified, adds bias to the key and value sequences at dim=0. Default: ``False``.
-            add_zero_attn: If specified, adds a new batch of zeros to the key and value sequences at dim=1.
-                Default: ``False``.
-            kdim: Total number of features for keys. Default: ``None`` (uses ``kdim=embed_dim``).
-            vdim: Total number of features for values. Default: ``None`` (uses ``vdim=embed_dim``).
-            batch_first: If ``True``, then the input and output tensors are provided
-                as (batch, seq, feature). Default: ``False`` (seq, batch, feature).
+            embed_dim: Total dimension of the model.
+            num_heads: Parallel attention heads.
+            is_decoder: Whether or not multihead attention is being performed on a decoder layer. Default: `False`
+            dropout: Probability of an element to be zeroed. Default: 0.0
+            bias: If specified, adds bias to input / output projection layers. Default: `False`.
+            kdim: Total number of features for keys. Default: `None` (uses `kdim=embed_dim`).
+            vdim: Total number of features for values. Default: `None` (uses `vdim=embed_dim`).
         """
-        super().__init__(
-            embed_dim, num_heads, dropout, bias, add_bias_kv, add_zero_attn, kdim, vdim, batch_first, device, dtype
-        )
+        super().__init__(embed_dim, num_heads, dropout, bias, False, False, kdim, vdim, True, device, dtype)
         factory_kwargs = {"device": device, "dtype": dtype}
         self.is_decoder = is_decoder
         self.q_proj_weight = nn.Parameter(torch.empty((embed_dim, embed_dim), **factory_kwargs))
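The collapsed super().__init__ call above passes its options positionally, which hides what the removed constructor arguments are now pinned to. As a minimal sketch (the class name T5MultiheadAttention and the surrounding class body are assumed for illustration; only the argument mapping comes from the diff), the same call written with nn.MultiheadAttention keywords looks like this:

```python
import torch.nn as nn


class T5MultiheadAttention(nn.MultiheadAttention):  # class name assumed for illustration
    def __init__(
        self,
        embed_dim,
        num_heads,
        is_decoder=False,
        dropout=0.0,
        bias=False,
        kdim=None,
        vdim=None,
        device=None,
        dtype=None,
    ) -> None:
        # Keyword form of the positional call in the diff:
        # super().__init__(embed_dim, num_heads, dropout, bias, False, False, kdim, vdim, True, device, dtype)
        super().__init__(
            embed_dim,
            num_heads,
            dropout=dropout,
            bias=bias,
            add_bias_kv=False,    # no longer a constructor argument; always disabled
            add_zero_attn=False,  # no longer a constructor argument; always disabled
            kdim=kdim,
            vdim=vdim,
            batch_first=True,     # no longer a constructor argument; inputs are always (batch, seq, feature)
            device=device,
            dtype=dtype,
        )
        self.is_decoder = is_decoder
```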
@@ -64,7 +54,7 @@ def __init__(
     def forward():
         pass
 
-    # NOTE: modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
+    # NOTE: modified from https://github.com/huggingface/transformers/blob/8581a798c0a48fca07b29ce2ca2ef55adcae8c7e/src/transformers/models/t5/modeling_t5.py#L421
     def _compute_bias(
         self,
         query_length: int,
@@ -91,7 +81,7 @@ def _compute_bias(
         values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
         return values
 
-    # NOTE: taken from https://github.com/huggingface/transformers/blob/main/src/transformers/models/t5/modeling_t5.py
+    # NOTE: Taken from https://github.com/huggingface/transformers/blob/8581a798c0a48fca07b29ce2ca2ef55adcae8c7e/src/transformers/models/t5/modeling_t5.py#L374
     def _relative_position_bucket(
         self, relative_position: Tensor, bidirectional: bool = True, num_buckets: int = 32, max_distance: int = 128
     ):
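The permute/unsqueeze line in the hunk above is the final step of building the relative attention bias tensor that T5 adds to the attention scores. Below is a minimal, self-contained sketch of that shape flow; it assumes a learned nn.Embedding named relative_attention_bias (as in the linked HF reference) and uses random bucket indices as stand-ins for the output of _relative_position_bucket:

```python
import torch
import torch.nn as nn

# Illustrative sizes (not taken from the diff).
num_heads, num_buckets = 8, 32
query_length, key_length = 5, 7
batch_size = 2

# In the real module these indices come from _relative_position_bucket();
# random values with the right shape and range stand in for them here.
relative_position_bucket = torch.randint(num_buckets, (query_length, key_length))

# One learned scalar per (bucket, head) pair.
relative_attention_bias = nn.Embedding(num_buckets, num_heads)

values = relative_attention_bias(relative_position_bucket)  # (query_length, key_length, num_heads)
values = values.permute([2, 0, 1]).unsqueeze(0)             # (1, num_heads, query_length, key_length)

# The bias broadcasts over the batch dimension when added to the raw
# attention scores before the softmax.
scores = torch.randn(batch_size, num_heads, query_length, key_length)
scores = scores + values
```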
@@ -119,9 +109,9 @@ def _relative_position_bucket(
             relative_position = torch.abs(relative_position)
         else:
             relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
-        # now relative_position is in the range [0, inf)
+        # Ensure relative_position is in the range [0, inf)
 
-        # half of the buckets are for exact increments in positions
+        # Half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact
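For reference, the bucketing scheme that the hunk above is part of (and that the linked modeling_t5.py implements) maps signed position offsets to a fixed number of buckets: half of them cover exact small offsets, and the other half cover larger distances in logarithmically growing bins up to max_distance. A self-contained sketch written as a free function, not the torchtext method itself:

```python
import math

import torch
from torch import Tensor


def relative_position_bucket(
    relative_position: Tensor, bidirectional: bool = True, num_buckets: int = 32, max_distance: int = 128
) -> Tensor:
    """Map signed offsets (key_position - query_position) to bucket indices in [0, num_buckets)."""
    relative_buckets = torch.zeros_like(relative_position)
    if bidirectional:
        # Split the buckets between "key after query" and "key at or before query".
        num_buckets //= 2
        relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
        relative_position = torch.abs(relative_position)
    else:
        # Causal case: clamp positive offsets to zero and work with distances into the past.
        relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
    # relative_position is now in the range [0, inf)

    # Half of the remaining buckets are for exact increments in positions.
    max_exact = num_buckets // 2
    is_small = relative_position < max_exact

    # The other half covers distances up to max_distance in logarithmically growing bins.
    relative_position_if_large = max_exact + (
        torch.log(relative_position.float() / max_exact)
        / math.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    ).to(torch.long)
    relative_position_if_large = torch.min(
        relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
    )

    relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
    return relative_buckets


# Example: bucket the offsets for a 5-token query attending over 7 keys.
context_position = torch.arange(5)[:, None]
memory_position = torch.arange(7)[None, :]
buckets = relative_position_bucket(memory_position - context_position)  # shape (5, 7), values in [0, 32)
```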
