@@ -679,6 +679,55 @@ def _unfuse_lora_apply(self, module):
         if hasattr(module, "_unfuse_lora"):
             module._unfuse_lora()
 
+    def set_adapters(
+        self,
+        adapter_names: Union[List[str], str],
+        weights: List[float] = None,
+    ):
+        """
+        Sets the adapter layers for the unet.
+
+        Args:
+            adapter_names (`List[str]` or `str`):
+                The names of the adapters to use.
+            weights (`List[float]`, *optional*):
+                The weights to use for the unet. If `None`, the weights are set to `1.0` for all the adapters.
+        """
+        if not self.use_peft_backend:
+            raise ValueError("PEFT backend is required for this method.")
+
+        def process_weights(adapter_names, weights):
+            if weights is None:
+                weights = [1.0] * len(adapter_names)
+            elif isinstance(weights, float):
+                weights = [weights]
+
+            if len(adapter_names) != len(weights):
+                raise ValueError(
+                    f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}"
+                )
+            return weights
+
+        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
+        weights = process_weights(adapter_names, weights)
+        set_weights_and_activate_adapters(self, adapter_names, weights)
+
+    def disable_lora(self):
+        """
+        Disables the LoRA layers for the unet.
+        """
+        if not self.use_peft_backend:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=False)
+
+    def enable_lora(self):
+        """
+        Enables the LoRA layers for the unet.
+        """
+        if not self.use_peft_backend:
+            raise ValueError("PEFT backend is required for this method.")
+        set_adapter_layers(self, enabled=True)
+
 
 def load_textual_inversion_state_dicts(pretrained_model_name_or_paths, **kwargs):
     cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE)
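For illustration, the three UNet-level helpers added above would typically be reached through a pipeline's `unet` attribute once adapters are loaded. A minimal sketch, assuming the PEFT backend is enabled; the checkpoint, LoRA repo ids, and adapter names are placeholders, not from this commit:

import torch
from diffusers import DiffusionPipeline

pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0", torch_dtype=torch.float16
)

# Placeholder LoRA repositories and adapter names, purely for illustration.
pipe.load_lora_weights("some-user/style-lora", adapter_name="style")
pipe.load_lora_weights("some-user/subject-lora", adapter_name="subject")

# Blend the two adapters inside the UNet only.
pipe.unet.set_adapters(["style", "subject"], weights=[0.7, 0.3])

# Temporarily switch every LoRA layer in the UNet off, then back on.
pipe.unet.disable_lora()
pipe.unet.enable_lora()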
@@ -1448,7 +1497,7 @@ def _maybe_map_sgm_blocks_to_diffusers(cls, state_dict, unet_config, delimiter="
 
     @classmethod
     def load_lora_into_unet(
-        cls, state_dict, network_alphas, unet, low_cpu_mem_usage=None, _pipeline=None, adapter_name="default"
+        cls, state_dict, network_alphas, unet, low_cpu_mem_usage=None, _pipeline=None, adapter_name=None
     ):
         """
         This will load the LoRA layers specified in `state_dict` into `unet`.
@@ -1468,7 +1517,8 @@ def load_lora_into_unet(
                 Only supported for PyTorch >= 1.9.0. If you are using an older version of PyTorch, setting this
                 argument to `True` will raise an error.
             adapter_name (`str`, *optional*):
-                The name of the adapter to load the weights into. By default we use `"default"`
+                Adapter name to be used for referencing the loaded adapter model. If not specified, it will use
+                `default_{i}` where i is the total number of adapters being loaded.
         """
         low_cpu_mem_usage = low_cpu_mem_usage if low_cpu_mem_usage is not None else _LOW_CPU_MEM_USAGE_DEFAULT
         # If the serialization format is new (introduced in https://github.com/huggingface/diffusers/pull/2918),
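To illustrate the documented default: when `adapter_name` is omitted, each loaded LoRA receives an auto-generated name, so repeated loads remain individually addressable. A short sketch continuing the pipeline from the earlier example, with the naming behavior assumed from the docstring above and placeholder repo ids:

pipe.load_lora_weights("some-user/style-lora")    # registered as "default_0"
pipe.load_lora_weights("some-user/subject-lora")  # registered as "default_1"
pipe.set_adapters(["default_0", "default_1"])     # activate both with weight 1.0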
@@ -1500,38 +1550,19 @@ def load_lora_into_unet(
 
             state_dict = convert_unet_state_dict_to_peft(state_dict)
 
-            target_modules = []
-            ranks = []
+            rank = {}
             for key in state_dict.keys():
-                # filter out the name
-                filtered_name = ".".join(key.split(".")[:-2])
-                target_modules.append(filtered_name)
                 if "lora_B" in key:
-                    rank = state_dict[key].shape[1]
-                    ranks.append(rank)
+                    rank[key] = state_dict[key].shape[1]
 
-            current_rank = ranks[0]
-            if not all(rank == current_rank for rank in ranks):
-                raise ValueError("Multi-rank not supported yet")
+            lora_config_kwargs = get_peft_kwargs(rank, network_alphas, state_dict)
+            lora_config = LoraConfig(**lora_config_kwargs)
 
-            if network_alphas is not None:
-                alphas = set(network_alphas.values())
-                if len(alphas) == 1:
-                    alpha = alphas.pop()
-                    # TODO: support multi-alpha
-                else:
-                    raise ValueError("Multi-alpha not supported yet")
-            else:
-                alpha = current_rank
-
-            lora_config = LoraConfig(
-                r=current_rank,
-                lora_alpha=alpha,
-                target_modules=target_modules,
-            )
+            # adapter_name
+            if adapter_name is None:
+                adapter_name = get_adapter_name(unet)
 
             inject_adapter_in_model(lora_config, unet, adapter_name=adapter_name)
-
             incompatible_keys = set_peft_model_state_dict(unet, state_dict, adapter_name)
 
             if incompatible_keys is not None:
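The replaced block above is what lifts the old single-rank/single-alpha restriction: instead of asserting one global rank, the loader now records the rank of every module from its `lora_B` matrix and lets `get_peft_kwargs` turn that mapping into a `LoraConfig`. A standalone sketch of the idea, using toy tensors and made-up key names rather than the library's actual helper:

import torch
from collections import Counter

# Toy PEFT-format state dict with two modules trained at different ranks.
state_dict = {
    "down_blocks.0.attn.to_q.lora_A.weight": torch.zeros(4, 320),
    "down_blocks.0.attn.to_q.lora_B.weight": torch.zeros(320, 4),
    "mid_block.attn.to_k.lora_A.weight": torch.zeros(8, 640),
    "mid_block.attn.to_k.lora_B.weight": torch.zeros(640, 8),
}

# Same collection step as in the diff: a module's rank is the number of
# columns of its lora_B matrix.
rank = {key: t.shape[1] for key, t in state_dict.items() if "lora_B" in key}

# A per-module rank_pattern (a field on peft.LoraConfig) can then carry the
# exceptions to the most common rank, so mixed-rank checkpoints load cleanly.
r = Counter(rank.values()).most_common(1)[0][0]
rank_pattern = {k.replace(".lora_B.weight", ""): v for k, v in rank.items() if v != r}
print(r, rank_pattern)  # e.g. 4 {'mid_block.attn.to_k': 8}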
@@ -1655,12 +1686,14 @@ def load_lora_into_text_encoder(
             if adapter_name is None:
                 adapter_name = get_adapter_name(text_encoder)
 
+
             # inject LoRA layers and load the state dict
             text_encoder.load_adapter(
                 adapter_name=adapter_name,
                 adapter_state_dict=text_encoder_lora_state_dict,
                 peft_config=lora_config,
             )
+
             # scale LoRA layers with `lora_scale`
             scale_lora_layers(text_encoder, weight=lora_scale)
 
@@ -2258,7 +2291,7 @@ def unfuse_text_encoder_lora(text_encoder):
 
         self.num_fused_loras -= 1
 
-    def set_adapter_for_text_encoder(
+    def set_adapters_for_text_encoder(
         self,
         adapter_names: Union[List[str], str],
         text_encoder: Optional[PreTrainedModel] = None,
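The rename above brings the per-text-encoder method in line with the plural `set_adapters` naming. The pipeline-level code in the next hunk calls it positionally as `(adapter_names, text_encoder, weights)`, so a direct call would look roughly like the sketch below; this reuses the placeholder pipeline and adapter names from earlier and assumes the third positional argument is the per-adapter weight list:

# Activate both adapters on the primary text encoder with custom weights.
pipe.set_adapters_for_text_encoder(["style", "subject"], pipe.text_encoder, [1.0, 0.5])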
@@ -2336,60 +2369,44 @@ def enable_lora_for_text_encoder(self, text_encoder: Optional[PreTrainedModel] =
     def set_adapters(
         self,
         adapter_names: Union[List[str], str],
-        weights: List[float] = None,
+        unet_weights: List[float] = None,
+        te_weights: List[float] = None,
+        te2_weights: List[float] = None,
     ):
-        """
-        Sets the adapter layers for the unet.
-
-        Args:
-            adapter_names (`List[str]` or `str`):
-                The names of the adapters to use.
-            weights (`List[float]`, *optional*):
-                The weights to use for the unet. If `None`, the weights are set to `1.0` for all the adapters.
-        """
-        if not self.use_peft_backend:
-            raise ValueError("PEFT backend is required for this method.")
-
-        def process_weights(adapter_names, weights):
-            if weights is None:
-                weights = [1.0] * len(adapter_names)
-            elif isinstance(weights, float):
-                weights = [weights]
-
-            if len(adapter_names) != len(weights):
-                raise ValueError(
-                    f"Length of adapter names {len(adapter_names)} is not equal to the length of the weights {len(weights)}"
-                )
-            return weights
-
-        adapter_names = [adapter_names] if isinstance(adapter_names, str) else adapter_names
-        weights = process_weights(adapter_names, weights)
+        # Handle the UNET
+        self.unet.set_adapters(adapter_names, unet_weights)
 
-        for key, value in self.components.items():
-            if isinstance(value, nn.Module):
-                set_weights_and_activate_adapters(value, adapter_names, weights)
+        # Handle the Text Encoder
+        if hasattr(self, "text_encoder"):
+            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder, te_weights)
+        if hasattr(self, "text_encoder_2"):
+            self.set_adapters_for_text_encoder(adapter_names, self.text_encoder_2, te2_weights)
 
     def disable_lora(self):
-        """
-        Disables the LoRA layers for the unet.
-        """
         if not self.use_peft_backend:
             raise ValueError("PEFT backend is required for this method.")
 
-        for key, value in self.components.items():
-            if isinstance(value, nn.Module):
-                set_adapter_layers(value, enabled=False)
+        # Disable unet adapters
+        self.unet.disable_lora()
+
+        # Disable text encoder adapters
+        if hasattr(self, "text_encoder"):
+            self.disable_lora_for_text_encoder(self.text_encoder)
+        if hasattr(self, "text_encoder_2"):
+            self.disable_lora_for_text_encoder(self.text_encoder_2)
 
     def enable_lora(self):
-        """
-        Enables the LoRA layers for the unet.
-        """
         if not self.use_peft_backend:
             raise ValueError("PEFT backend is required for this method.")
 
-        for key, value in self.components.items():
-            if isinstance(value, nn.Module):
-                set_adapter_layers(value, enabled=True)
+        # Enable unet adapters
+        self.unet.enable_lora()
+
+        # Enable text encoder adapters
+        if hasattr(self, "text_encoder"):
+            self.enable_lora_for_text_encoder(self.text_encoder)
+        if hasattr(self, "text_encoder_2"):
+            self.enable_lora_for_text_encoder(self.text_encoder_2)
 
 
 class FromSingleFileMixin:
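Taken together, the pipeline-level methods now simply fan out to the UNet and text-encoder helpers. A closing usage sketch under the same assumptions as above: PEFT backend enabled, placeholder adapter names, and `text_encoder_2` only present on SDXL-style pipelines:

pipe.set_adapters(
    ["style", "subject"],
    unet_weights=[0.8, 0.5],
    te_weights=[1.0, 0.0],   # text_encoder
    te2_weights=[1.0, 0.0],  # text_encoder_2, SDXL only
)

pipe.disable_lora()  # switches LoRA off in the UNet and both text encoders
pipe.enable_lora()   # switches it back on without reloading any weights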