
Commit bff9c84

Support to load Kohya-ss style LoRA file format (without restrictions)
Heavily based on huggingface#3756 by @takuma104.

Co-Authored-By: Takuma Mori <[email protected]>
1 parent 3eb498e commit bff9c84

7 files changed: +221 −45 lines
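Before the per-file diffs, a quick sketch of how the new code path is exercised from a pipeline. The Hub model id is real, but the LoRA file name and the prompt are placeholders rather than anything referenced by this commit.

# Minimal usage sketch (not part of this commit); the LoRA file name and prompt
# are placeholders. load_lora_weights() recognizes the Kohya-ss key layout
# (lora_unet_* / lora_te_*) and converts it, including the feed-forward and
# projection layers this commit adds support for.
import torch
from diffusers import StableDiffusionPipeline

pipe = StableDiffusionPipeline.from_pretrained(
    "runwayml/stable-diffusion-v1-5", torch_dtype=torch.float16
).to("cuda")

pipe.load_lora_weights(".", weight_name="some_kohya_style_lora.safetensors")

image = pipe("a prompt matching the LoRA's training style").images[0]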

examples/dreambooth/train_dreambooth_lora.py

Lines changed: 1 addition & 1 deletion
@@ -924,7 +924,7 @@ def load_model_hook(models, input_dir):
             else:
                 raise ValueError(f"unexpected save model: {model.__class__}")
 
-        lora_state_dict, network_alpha = LoraLoaderMixin.lora_state_dict(input_dir)
+        lora_state_dict, network_alpha, _ = LoraLoaderMixin.lora_state_dict(input_dir)
         LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alpha=network_alpha, unet=unet_)
         LoraLoaderMixin.load_lora_into_text_encoder(
             lora_state_dict, network_alpha=network_alpha, text_encoder=text_encoder_

examples/dreambooth/train_dreambooth_lora_sdxl.py

Lines changed: 1 addition & 1 deletion
@@ -836,7 +836,7 @@ def load_model_hook(models, input_dir):
             else:
                 raise ValueError(f"unexpected save model: {model.__class__}")
 
-        lora_state_dict, network_alpha = LoraLoaderMixin.lora_state_dict(input_dir)
+        lora_state_dict, network_alpha, _ = LoraLoaderMixin.lora_state_dict(input_dir)
         LoraLoaderMixin.load_lora_into_unet(lora_state_dict, network_alpha=network_alpha, unet=unet_)
         LoraLoaderMixin.load_lora_into_text_encoder(
             lora_state_dict, network_alpha=network_alpha, text_encoder=text_encoder_

src/diffusers/loaders.py

Lines changed: 101 additions & 9 deletions
@@ -32,11 +32,11 @@
     LoRAAttnAddedKVProcessor,
     LoRAAttnProcessor,
     LoRAAttnProcessor2_0,
-    LoRALinearLayer,
     LoRAXFormersAttnProcessor,
     SlicedAttnAddedKVProcessor,
     XFormersAttnProcessor,
 )
+from .models.lora import Conv2dWithLoRA, LinearWithLoRA, LoRAConv2dLayer, LoRALinearLayer
 from .utils import (
     DIFFUSERS_CACHE,
     HF_HUB_OFFLINE,
@@ -464,6 +464,36 @@ def save_function(weights, filename):
         save_function(state_dict, os.path.join(save_directory, weight_name))
         logger.info(f"Model weights saved in {os.path.join(save_directory, weight_name)}")
 
+    def _load_lora_aux(self, state_dict, network_alpha=None):
+        lora_grouped_dict = defaultdict(dict)
+        for key, value in state_dict.items():
+            attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+            lora_grouped_dict[attn_processor_key][sub_key] = value
+
+        for key, value_dict in lora_grouped_dict.items():
+            rank = value_dict["lora.down.weight"].shape[0]
+            hidden_size = value_dict["lora.up.weight"].shape[0]
+            target_modules = [module for name, module in self.named_modules() if name == key]
+            if len(target_modules) == 0:
+                logger.warning(f"Could not find module {key} in the model. Skipping.")
+                continue
+
+            target_module = target_modules[0]
+            value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()}
+
+            lora = None
+            if isinstance(target_module, Conv2dWithLoRA):
+                lora = LoRAConv2dLayer(hidden_size, hidden_size, rank, network_alpha)
+            elif isinstance(target_module, LinearWithLoRA):
+                lora = LoRALinearLayer(target_module.in_features, target_module.out_features, rank, network_alpha)
+            else:
+                raise ValueError(f"Module {key} is not a Conv2dWithLoRA or LinearWithLoRA module.")
+            lora.load_state_dict(value_dict)
+            lora.to(device=self.device, dtype=self.dtype)
+
+            # install lora
+            target_module.lora_layer = lora
+
 
 class TextualInversionLoaderMixin:
     r"""
@@ -825,10 +855,18 @@ def load_lora_weights(self, pretrained_model_name_or_path_or_dict: Union[str, Di
             kwargs:
                 See [`~loaders.LoraLoaderMixin.lora_state_dict`].
         """
-        state_dict, network_alpha = self.lora_state_dict(pretrained_model_name_or_path_or_dict, **kwargs)
-        self.load_lora_into_unet(state_dict, network_alpha=network_alpha, unet=self.unet)
+        state_dict, network_alpha, (unet_state_dict_aux, te_state_dict_aux) = self.lora_state_dict(
+            pretrained_model_name_or_path_or_dict, **kwargs
+        )
+        self.load_lora_into_unet(
+            state_dict, network_alpha=network_alpha, unet=self.unet, state_dict_aux=unet_state_dict_aux
+        )
         self.load_lora_into_text_encoder(
-            state_dict, network_alpha=network_alpha, text_encoder=self.text_encoder, lora_scale=self.lora_scale
+            state_dict,
+            network_alpha=network_alpha,
+            text_encoder=self.text_encoder,
+            lora_scale=self.lora_scale,
+            state_dict_aux=te_state_dict_aux,
         )
 
     @classmethod
@@ -962,13 +1000,14 @@ def lora_state_dict(
 
         # Convert kohya-ss Style LoRA attn procs to diffusers attn procs
         network_alpha = None
+        auxilary_states = ({}, {})
         if all((k.startswith("lora_te_") or k.startswith("lora_unet_")) for k in state_dict.keys()):
-            state_dict, network_alpha = cls._convert_kohya_lora_to_diffusers(state_dict)
+            state_dict, network_alpha, auxilary_states = cls._convert_kohya_lora_to_diffusers(state_dict)
 
-        return state_dict, network_alpha
+        return state_dict, network_alpha, auxilary_states
 
     @classmethod
-    def load_lora_into_unet(cls, state_dict, network_alpha, unet):
+    def load_lora_into_unet(cls, state_dict, network_alpha, unet, aux_state_dict=None):
         """
         This will load the LoRA layers specified in `state_dict` into `unet`
 
@@ -981,6 +1020,8 @@ def load_lora_into_unet(cls, state_dict, network_alpha, unet):
                 See `LoRALinearLayer` for more details.
             unet (`UNet2DConditionModel`):
                 The UNet model to load the LoRA layers into.
+            aux_state_dict (`dict`, *optional*):
+                A dictionary containing the auxilary state (additional lora state) dict for the unet.
         """
 
         # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
@@ -1005,8 +1046,11 @@ def load_lora_into_unet(cls, state_dict, network_alpha, unet):
             warn_message = "You have saved the LoRA weights using the old format. To convert the old LoRA weights to the new format, you can first load them in a dictionary and then create a new dictionary like the following: `new_state_dict = {f'unet'.{module_name}: params for module_name, params in old_state_dict.items()}`."
             warnings.warn(warn_message)
 
+        if aux_state_dict:
+            unet._load_lora_aux(aux_state_dict, network_alpha=network_alpha)
+
     @classmethod
-    def load_lora_into_text_encoder(cls, state_dict, network_alpha, text_encoder, lora_scale=1.0):
+    def load_lora_into_text_encoder(cls, state_dict, network_alpha, text_encoder, lora_scale=1.0, state_dict_aux=None):
         """
         This will load the LoRA layers specified in `state_dict` into `text_encoder`
 
@@ -1021,6 +1065,8 @@ def load_lora_into_text_encoder(cls, state_dict, network_alpha, text_encoder, lo
             lora_scale (`float`):
                 How much to scale the output of the lora linear layer before it is added with the output of the regular
                 lora layer.
+            state_dict_aux (`dict`, *optional*):
+                A dictionary containing the auxilary state dict (additional lora state) for the text encoder.
         """
 
         # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
@@ -1078,6 +1124,8 @@ def load_lora_into_text_encoder(cls, state_dict, network_alpha, text_encoder, lo
             ].shape[1]
 
             cls._modify_text_encoder(text_encoder, lora_scale, network_alpha, rank=rank)
+            if state_dict_aux:
+                cls._load_lora_aux_for_text_encoder(text_encoder, state_dict_aux, network_alpha=network_alpha)
 
             # set correct dtype & device
             text_encoder_lora_state_dict = {
@@ -1109,6 +1157,37 @@ def _remove_text_encoder_monkey_patch_classmethod(cls, text_encoder):
             attn_module.v_proj = attn_module.v_proj.regular_linear_layer
             attn_module.out_proj = attn_module.out_proj.regular_linear_layer
 
+    @classmethod
+    def _load_lora_aux_for_text_encoder(cls, text_encoder, state_dict, network_alpha=None):
+        lora_grouped_dict = defaultdict(dict)
+        for key, value in state_dict.items():
+            attn_processor_key, sub_key = ".".join(key.split(".")[:-3]), ".".join(key.split(".")[-3:])
+            lora_grouped_dict[attn_processor_key][sub_key] = value
+
+        for key, value_dict in lora_grouped_dict.items():
+            rank = value_dict["lora.down.weight"].shape[0]
+            target_modules = [module for name, module in text_encoder.named_modules() if name == key]
+            if len(target_modules) == 0:
+                logger.warning(f"Could not find module {key} in the model. Skipping.")
+                continue
+
+            target_module = target_modules[0]
+            value_dict = {k.replace("lora.", ""): v for k, v in value_dict.items()}
+            lora_layer = LoRALinearLayer(target_module.in_features, target_module.out_features, rank, network_alpha)
+            lora_layer.load_state_dict(value_dict)
+            lora_layer.to(device=text_encoder.device, dtype=text_encoder.dtype)
+
+            old_forward = target_module.forward
+
+            def make_new_forward(old_forward, lora_layer):
+                def new_forward(x):
+                    return old_forward(x) + lora_layer(x)
+
+                return new_forward
+
+            # Monkey-patch.
+            target_module.forward = make_new_forward(old_forward, lora_layer)
+
     @classmethod
     def _modify_text_encoder(cls, text_encoder, lora_scale=1, network_alpha=None, rank=4, dtype=None):
         r"""
@@ -1225,6 +1304,8 @@ def save_function(weights, filename):
     def _convert_kohya_lora_to_diffusers(cls, state_dict):
         unet_state_dict = {}
         te_state_dict = {}
+        unet_state_dict_aux = {}
+        te_state_dict_aux = {}
         network_alpha = None
 
         for key, value in state_dict.items():
@@ -1249,12 +1330,20 @@ def _convert_kohya_lora_to_diffusers(cls, state_dict):
                     diffusers_name = diffusers_name.replace("to.k.lora", "to_k_lora")
                     diffusers_name = diffusers_name.replace("to.v.lora", "to_v_lora")
                     diffusers_name = diffusers_name.replace("to.out.0.lora", "to_out_lora")
+                    diffusers_name = diffusers_name.replace("proj.in", "proj_in")
+                    diffusers_name = diffusers_name.replace("proj.out", "proj_out")
                     if "transformer_blocks" in diffusers_name:
                         if "attn1" in diffusers_name or "attn2" in diffusers_name:
                             diffusers_name = diffusers_name.replace("attn1", "attn1.processor")
                             diffusers_name = diffusers_name.replace("attn2", "attn2.processor")
                             unet_state_dict[diffusers_name] = value
                             unet_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
+                        elif "ff" in diffusers_name:
+                            unet_state_dict_aux[diffusers_name] = value
+                            unet_state_dict_aux[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
+                    elif any(key in diffusers_name for key in ("proj_in", "proj_out")):
+                        unet_state_dict_aux[diffusers_name] = value
+                        unet_state_dict_aux[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
                 elif lora_name.startswith("lora_te_"):
                     diffusers_name = key.replace("lora_te_", "").replace("_", ".")
                     diffusers_name = diffusers_name.replace("text.model", "text_model")
@@ -1266,11 +1355,14 @@ def _convert_kohya_lora_to_diffusers(cls, state_dict):
                     if "self_attn" in diffusers_name:
                         te_state_dict[diffusers_name] = value
                         te_state_dict[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
+                    elif "mlp" in diffusers_name:
+                        te_state_dict_aux[diffusers_name] = value
+                        te_state_dict_aux[diffusers_name.replace(".down.", ".up.")] = state_dict[lora_name_up]
 
         unet_state_dict = {f"{UNET_NAME}.{module_name}": params for module_name, params in unet_state_dict.items()}
         te_state_dict = {f"{TEXT_ENCODER_NAME}.{module_name}": params for module_name, params in te_state_dict.items()}
         new_state_dict = {**unet_state_dict, **te_state_dict}
-        return new_state_dict, network_alpha
+        return new_state_dict, network_alpha, (unet_state_dict_aux, te_state_dict_aux)
 
     def unload_lora_weights(self):
         """

src/diffusers/models/attention.py

Lines changed: 3 additions & 2 deletions
@@ -21,6 +21,7 @@
 from .activations import get_activation
 from .attention_processor import Attention
 from .embeddings import CombinedTimestepLabelEmbeddings
+from .lora import LinearWithLoRA
 
 
 @maybe_allow_in_graph
@@ -245,7 +246,7 @@ def __init__(
         # project dropout
         self.net.append(nn.Dropout(dropout))
         # project out
-        self.net.append(nn.Linear(inner_dim, dim_out))
+        self.net.append(LinearWithLoRA(inner_dim, dim_out))
         # FF as used in Vision Transformer, MLP-Mixer, etc. have a final dropout
         if final_dropout:
             self.net.append(nn.Dropout(dropout))
@@ -289,7 +290,7 @@ class GEGLU(nn.Module):
 
     def __init__(self, dim_in: int, dim_out: int):
         super().__init__()
-        self.proj = nn.Linear(dim_in, dim_out * 2)
+        self.proj = LinearWithLoRA(dim_in, dim_out * 2)
 
     def gelu(self, gate):
         if gate.device.type != "mps":
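`LinearWithLoRA` itself is defined in src/diffusers/models/lora.py, which is among the seven changed files but is not shown in this excerpt. Based on how loaders.py assigns `target_module.lora_layer`, a wrapper of this kind presumably behaves roughly like the sketch below; treat it as an assumption, not a copy of the actual class.

# Hedged sketch of what a LinearWithLoRA-style wrapper looks like; the real
# class lives in src/diffusers/models/lora.py (not shown in this diff).
import torch.nn as nn


class LinearWithLoRASketch(nn.Linear):
    """nn.Linear drop-in whose output gains a LoRA correction once lora_layer is set."""

    def __init__(self, *args, lora_layer=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.lora_layer = lora_layer  # installed later, e.g. by _load_lora_aux

    def forward(self, hidden_states):
        if self.lora_layer is None:
            return super().forward(hidden_states)
        return super().forward(hidden_states) + self.lora_layer(hidden_states)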

src/diffusers/models/attention_processor.py

Lines changed: 1 addition & 30 deletions
@@ -19,6 +19,7 @@
 
 from ..utils import deprecate, logging, maybe_allow_in_graph
 from ..utils.import_utils import is_xformers_available
+from .lora import LoRALinearLayer
 
 
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -505,36 +506,6 @@ def __call__(
         return hidden_states
 
 
-class LoRALinearLayer(nn.Module):
-    def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
-        super().__init__()
-
-        if rank > min(in_features, out_features):
-            raise ValueError(f"LoRA rank {rank} must be less or equal than {min(in_features, out_features)}")
-
-        self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
-        self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
-        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
-        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
-        self.network_alpha = network_alpha
-        self.rank = rank
-
-        nn.init.normal_(self.down.weight, std=1 / rank)
-        nn.init.zeros_(self.up.weight)
-
-    def forward(self, hidden_states):
-        orig_dtype = hidden_states.dtype
-        dtype = self.down.weight.dtype
-
-        down_hidden_states = self.down(hidden_states.to(dtype))
-        up_hidden_states = self.up(down_hidden_states)
-
-        if self.network_alpha is not None:
-            up_hidden_states *= self.network_alpha / self.rank
-
-        return up_hidden_states.to(orig_dtype)
-
-
 class LoRAAttnProcessor(nn.Module):
     r"""
     Processor for implementing the LoRA attention mechanism.
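Note that the class removed here is not dropped from the library: it moves to src/diffusers/models/lora.py and is re-imported at the top of this file (see the `from .lora import LoRALinearLayer` hunk above), so the old import path should keep resolving. A quick check one could run against this commit:

# Both import paths should refer to the same class after this commit, because
# attention_processor.py re-imports LoRALinearLayer from its new module.
from diffusers.models.attention_processor import LoRALinearLayer as via_old_path
from diffusers.models.lora import LoRALinearLayer as via_new_path

assert via_old_path is via_new_path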
