@@ -583,7 +583,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             if device_map is None:
                 param_device = "cpu"
                 state_dict = load_state_dict(model_file, variant=variant)
-                cls.convert_deprecated_attention_blocks(state_dict)
+                model._convert_deprecated_attention_blocks(state_dict)
                 # move the params from meta device to cpu
                 missing_keys = set(model.state_dict().keys()) - set(state_dict.keys())
                 if len(missing_keys) > 0:
@@ -626,7 +626,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             model = cls.from_config(config, **unused_kwargs)
 
             state_dict = load_state_dict(model_file, variant=variant)
-            cls.convert_deprecated_attention_blocks(state_dict)
+            model._convert_deprecated_attention_blocks(state_dict)
 
             model, missing_keys, unexpected_keys, mismatched_keys, error_msgs = cls._load_pretrained_model(
                 model,
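The two hunks above change the call site from the classmethod `cls.convert_deprecated_attention_blocks(state_dict)` to the instance method `model._convert_deprecated_attention_blocks(state_dict)`: the new implementation (last hunk below) no longer scans state-dict key names but asks the instantiated module tree which sub-modules are flagged as deprecated attention blocks, so it needs access to `self`. A minimal sketch of that flag-based lookup, assuming the `_from_deprecated_attn_block` marker is set wherever the replacement attention module is constructed; `ToyBlock`/`ToyModel` are made-up names, not from this diff:

import torch.nn as nn

class ToyBlock(nn.Module):
    # Illustrative stand-in for a module that replaced the old attention block.
    _from_deprecated_attn_block = True  # marker the new helper looks for

class ToyModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.mid = nn.ModuleDict({"attn": ToyBlock()})

model = ToyModel()
# The marker lives on instantiated sub-modules, so discovery has to walk the
# model instance rather than the class -- hence the switch to an instance method.
flagged = [
    name
    for name, module in model.named_modules()
    if getattr(module, "_from_deprecated_attn_block", False)
]
print(flagged)  # ['mid.attn']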
@@ -763,42 +763,6 @@ def _find_mismatched_keys(
 
         return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
 
-    @classmethod
-    def convert_deprecated_attention_blocks(cls, state_dict):
-        # We check for the deprecated attention block via the `proj_attn` layer in the state dict
-        # The only other class with a layer called `proj_attn` is in `SelfAttention1d` which is used
-        # by only the top level model, `UNet1DModel`. Since, `UNet1DModel` wont have any of the deprecated
-        # attention blocks, we can just early return.
-        if cls.__name__ == "UNet1DModel":
-            return
-
-        deprecated_attention_block_paths = []
-
-        for k in state_dict.keys():
-            if "proj_attn.weight" in k:
-                index = k.index("proj_attn.weight")
-                path = k[: index - 1]
-                deprecated_attention_block_paths.append(path)
-
-        for path in deprecated_attention_block_paths:
-            # group_norm path stays the same
-
-            # query -> to_q
-            state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
-            state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
-
-            # key -> to_k
-            state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
-            state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
-
-            # value -> to_v
-            state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
-            state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
-
-            # proj_attn -> to_out.0
-            state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
-            state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
-
     @property
     def device(self) -> device:
         """
@@ -841,3 +805,34 @@ def num_parameters(self, only_trainable: bool = False, exclude_embeddings: bool
             return sum(p.numel() for p in non_embedding_parameters if p.requires_grad or not only_trainable)
         else:
             return sum(p.numel() for p in self.parameters() if p.requires_grad or not only_trainable)
+
+    def _convert_deprecated_attention_blocks(self, state_dict):
+        deprecated_attention_block_paths = []
+
+        def recursive_find_attn_block(name, module):
+            if hasattr(module, "_from_deprecated_attn_block") and module._from_deprecated_attn_block:
+                deprecated_attention_block_paths.append(name)
+            for sub_name, sub_module in module.named_children():
+                sub_name = sub_name if name == "" else f"{name}.{sub_name}"
+                recursive_find_attn_block(sub_name, sub_module)
+
+        recursive_find_attn_block("", self)
+
+        for path in deprecated_attention_block_paths:
+            # group_norm path stays the same
+
+            # query -> to_q
+            state_dict[f"{path}.to_q.weight"] = state_dict.pop(f"{path}.query.weight")
+            state_dict[f"{path}.to_q.bias"] = state_dict.pop(f"{path}.query.bias")
+
+            # key -> to_k
+            state_dict[f"{path}.to_k.weight"] = state_dict.pop(f"{path}.key.weight")
+            state_dict[f"{path}.to_k.bias"] = state_dict.pop(f"{path}.key.bias")
+
+            # value -> to_v
+            state_dict[f"{path}.to_v.weight"] = state_dict.pop(f"{path}.value.weight")
+            state_dict[f"{path}.to_v.bias"] = state_dict.pop(f"{path}.value.bias")
+
+            # proj_attn -> to_out.0
+            state_dict[f"{path}.to_out.0.weight"] = state_dict.pop(f"{path}.proj_attn.weight")
+            state_dict[f"{path}.to_out.0.bias"] = state_dict.pop(f"{path}.proj_attn.bias")
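For reference, a small self-contained sketch of the renaming `_convert_deprecated_attention_blocks` applies to one flagged path; the module path and tensor shapes below are made up for illustration and are not taken from this diff:

import torch

path = "mid_block.attentions.0"  # hypothetical module path of a flagged block

# Old-layout keys as they would appear in a deprecated-attention-block checkpoint.
state_dict = {
    f"{path}.{old}.{param}": torch.zeros(4)
    for old in ("query", "key", "value", "proj_attn")
    for param in ("weight", "bias")
}

# The same renaming the new method performs: query/key/value/proj_attn become
# to_q/to_k/to_v/to_out.0, while group_norm keys are left untouched.
renames = (("query", "to_q"), ("key", "to_k"), ("value", "to_v"), ("proj_attn", "to_out.0"))
for old, new in renames:
    for param in ("weight", "bias"):
        state_dict[f"{path}.{new}.{param}"] = state_dict.pop(f"{path}.{old}.{param}")

print(sorted(state_dict.keys()))
# ['mid_block.attentions.0.to_k.bias', 'mid_block.attentions.0.to_k.weight', ...]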