 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from typing import Callable, Optional, Union
+from typing import Callable, Optional, Union, Dict, Any

 import torch
 import torch.nn.functional as F
@@ -198,7 +198,13 @@ def set_processor(self, processor: "AttnProcessor"):

         self.processor = processor

-    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, **cross_attention_kwargs):
+    def forward(
+        self,
+        hidden_states: FloatTensor,
+        encoder_hidden_states: Optional[FloatTensor] = None,
+        attention_mask: Optional[FloatTensor] = None,
+        **cross_attention_kwargs: Dict[str, Any]
+    ):
         # The `CrossAttention` class can call different attention processors / attention functions
         # here we simply pass along all tensors to the selected processor class
         # For standard processors that are defined here, `**cross_attention_kwargs` is empty
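Not part of the diff: a self-contained toy sketch (class names and tensor sizes are invented) of the pass-through that the annotated `forward` performs, i.e. how an extra keyword such as `encoder_attention_bias` rides along in `**cross_attention_kwargs` and reaches the processor's `__call__`.

```python
from typing import Any, Dict, Optional

import torch
from torch import FloatTensor


class ToyProcessor:
    # stand-in for an AttnProcessor: reports which optional inputs arrived
    def __call__(
        self,
        attn,
        hidden_states: FloatTensor,
        encoder_hidden_states: Optional[FloatTensor] = None,
        attention_mask: Optional[FloatTensor] = None,
        encoder_attention_bias: Optional[FloatTensor] = None,
    ) -> FloatTensor:
        print("received encoder_attention_bias:", encoder_attention_bias is not None)
        return hidden_states


class ToyCrossAttention:
    # stand-in for CrossAttention: forward() simply defers to the selected processor
    def __init__(self):
        self.processor = ToyProcessor()

    def forward(
        self,
        hidden_states: FloatTensor,
        encoder_hidden_states: Optional[FloatTensor] = None,
        attention_mask: Optional[FloatTensor] = None,
        **cross_attention_kwargs: Dict[str, Any],
    ) -> FloatTensor:
        # all tensors, plus any extra keywords, are passed along unchanged
        return self.processor(
            self,
            hidden_states,
            encoder_hidden_states=encoder_hidden_states,
            attention_mask=attention_mask,
            **cross_attention_kwargs,
        )


hidden = torch.randn(2, 4096, 320)
text = torch.randn(2, 77, 768)
bias = torch.zeros(2, 77)
ToyCrossAttention().forward(hidden, encoder_hidden_states=text, encoder_attention_bias=bias)
```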
@@ -313,8 +319,12 @@ def __call__(
                     raise ValueError(f"two attention biases have been supplied: `attention_mask` and `encoder_attention_bias`. expected a maximum of one source of bias.")
                 attention_mask = encoder_attention_bias
                 # make broadcastable over query tokens
-                # TODO: consider aligning implementations such that AttnProcessor2_0 and CrossAttnProcessor do unsqueeze
-                # in the same way/circumstances -- AttnProcessor2_0 does it for `attention_mask` **and** for `encoder_attention_bias`.
+                # TODO: see if there's a satisfactory way to unify how the `attention_mask`/`encoder_attention_bias` code paths
+                # create this singleton dim. the way AttnProcessor2_0 does it could work.
+                # here I'm trying to avoid interfering with the original `attention_mask` code path,
+                # by limiting the unsqueeze() to just the `encoder_attention_bias` path, on the basis that
+                # `attention_mask` is already working without this change.
+                # maybe it's because UNet2DConditionModel#forward unsqueeze()s `attention_mask` earlier.
                 attention_mask = attention_mask.unsqueeze(-2)
             if attn.cross_attention_norm:
                 encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
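Not part of the diff: a standalone shape illustration (dimensions invented) of what the `unsqueeze(-2)` buys here. A per-key bias of shape `[batch, key_tokens]` gains a singleton query dimension so it can broadcast against attention scores of shape `[batch, query_tokens, key_tokens]`; the per-head handling happens later, in `prepare_attention_mask`.

```python
import torch

# invented shapes, for illustration only
batch, query_tokens, key_tokens = 2, 4096, 77

# per-key bias, e.g. -inf over padding positions of the text encoding
bias = torch.zeros(batch, key_tokens)
bias[:, 40:] = float("-inf")

scores = torch.randn(batch, query_tokens, key_tokens)  # query-key attention scores

bias = bias.unsqueeze(-2)   # [batch, 1, key_tokens]: singleton query dim
masked = scores + bias      # broadcasts over the query dimension
print(masked.shape)         # torch.Size([2, 4096, 77])
```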
@@ -453,18 +463,39 @@ class XFormersCrossAttnProcessor:
     def __init__(self, attention_op: Optional[Callable] = None):
         self.attention_op = attention_op

-    def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=None, attention_mask=None):
-        batch_size, sequence_length, _ = hidden_states.shape
-
-        attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
-
-        query = attn.to_q(hidden_states)
-
+    def __call__(
+        self,
+        attn: CrossAttention,
+        hidden_states: FloatTensor,
+        encoder_hidden_states: Optional[FloatTensor] = None,
+        attention_mask: Optional[FloatTensor] = None,
+        encoder_attention_bias: Optional[FloatTensor] = None,
+    ):
         if encoder_hidden_states is None:
             encoder_hidden_states = hidden_states
-        elif attn.cross_attention_norm:
-            encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+        else:
+            if encoder_attention_bias is not None:
+                if attention_mask is not None:
+                    # it's not well-defined whether `attention_mask` should be passed to self-attention, cross-attention, neither* or both.
+                    # if two sources of bias (`attention_mask`, `encoder_attention_bias`) are provided: it's likely to be a mistake.
+                    raise ValueError(f"two attention biases have been supplied: `attention_mask` and `encoder_attention_bias`. expected a maximum of one source of bias.")
+                attention_mask = encoder_attention_bias
+
+                # TODO: figure out why the original `attention_mask` code path didn't attempt broadcasting over query tokens.
+                # it feels like this logic would be needed in that code path too.

+                # make broadcastable over query tokens
+                attention_mask = attention_mask.unsqueeze(-2)
+                _, query_tokens, _ = hidden_states.shape
+                # xformers doesn't broadcast for us, so we expand our singleton dimension manually
+                attention_mask = attention_mask.expand(-1, query_tokens, -1)
+            if attn.cross_attention_norm:
+                encoder_hidden_states = attn.norm_cross(encoder_hidden_states)
+
+        batch_size, key_tokens, _ = encoder_hidden_states.shape
+        attention_mask = attn.prepare_attention_mask(attention_mask, key_tokens, batch_size)
+
+        query = attn.to_q(hidden_states)
         key = attn.to_k(encoder_hidden_states)
         value = attn.to_v(encoder_hidden_states)

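Not part of the diff: a standalone sketch (shapes invented) of the `unsqueeze`/`expand` pair in this xformers path. As the diff's comment notes, xformers does not broadcast a singleton query dimension in the bias for us, so the singleton is expanded to the full query length; `expand` returns a view, so the repeated rows cost no extra memory.

```python
import torch

# invented shapes, for illustration only
batch, query_tokens, key_tokens = 2, 4096, 77

attention_mask = torch.zeros(batch, key_tokens)               # per-key bias
attention_mask = attention_mask.unsqueeze(-2)                 # [batch, 1, key_tokens]
attention_mask = attention_mask.expand(-1, query_tokens, -1)  # [batch, query_tokens, key_tokens]

print(attention_mask.shape)     # torch.Size([2, 4096, 77])
print(attention_mask.stride())  # (77, 0, 1): stride 0 on the query dim, i.e. a view, not a copy
```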
@@ -478,10 +509,10 @@ def __call__(self, attn: CrossAttention, hidden_states, encoder_hidden_states=No
         hidden_states = hidden_states.to(query.dtype)
         hidden_states = attn.batch_to_head_dim(hidden_states)

-        # linear proj
-        hidden_states = attn.to_out[0](hidden_states)
-        # dropout
-        hidden_states = attn.to_out[1](hidden_states)
+        linear_proj, dropout = attn.to_out
+
+        hidden_states = linear_proj(hidden_states)
+        hidden_states = dropout(hidden_states)
         return hidden_states

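Not part of the diff: a self-contained check (layer sizes invented) that the tuple-unpack is behaviourally identical to the indexing it replaces, on the assumption that `to_out` is an `nn.ModuleList` holding a `Linear` followed by a `Dropout`, as in this module.

```python
import torch
from torch import nn

# invented sizes, for illustration only
to_out = nn.ModuleList([nn.Linear(320, 320), nn.Dropout(p=0.0)])
hidden_states = torch.randn(2, 4096, 320)

# new style: tuple-unpack the two submodules
linear_proj, dropout = to_out
new_out = dropout(linear_proj(hidden_states))

# old style: index into the ModuleList
old_out = to_out[1](to_out[0](hidden_states))

print(torch.equal(new_out, old_out))  # True (dropout is a no-op at p=0.0)
```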