Commit 965dfe1

move to cpu_offload along with minor internal changes to make it work
1 parent: 4b4c69a

File tree

4 files changed, +9 -12 lines changed


setup.py

Lines changed: 1 addition & 1 deletion
@@ -79,7 +79,7 @@
 # 2. once modified, run: `make deps_table_update` to update src/diffusers/dependency_versions_table.py
 _deps = [
     "Pillow<10.0",  # keep the PIL.Image.Resampling deprecation away
-    "accelerate>=0.11.0",
+    "accelerate>=0.14.0",
     "black==22.8",
     "datasets",
     "filelock",

src/diffusers/pipeline_utils.py

Lines changed: 2 additions & 0 deletions
@@ -206,6 +206,8 @@ def device(self) -> torch.device:
         for name in module_names.keys():
             module = getattr(self, name)
             if isinstance(module, torch.nn.Module):
+                if module.device == torch.device("meta"):
+                    return torch.device("cpu")
                 return module.device
         return torch.device("cpu")
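
The meta-device check above is needed because accelerate's cpu_offload replaces a wrapped module's weights with placeholders on PyTorch's "meta" device and only materializes the real weights around each forward pass, so the module itself no longer reports a usable device. A minimal sketch of that behavior (assuming accelerate>=0.14.0 and an available CUDA device; the toy Linear module is illustrative, not part of the commit):

    import torch
    from accelerate import cpu_offload

    # Toy stand-in for a pipeline component (illustrative only).
    module = torch.nn.Linear(4, 4)

    # cpu_offload keeps the real weights on CPU and swaps them onto the
    # execution device only for the duration of each forward pass.
    cpu_offload(module, execution_device=torch.device("cuda"))

    # Outside a forward pass the parameters sit on the meta device, which
    # is the case the device property above now maps back to "cpu".
    print(next(module.parameters()).device)  # meta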

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

Lines changed: 4 additions & 9 deletions
@@ -121,20 +121,15 @@ def disable_attention_slicing(self):
 
     def cuda_with_minimal_gpu_usage(self):
         if is_accelerate_available():
-            from accelerate.hooks import attach_execution_device_hook
+            from accelerate import cpu_offload
         else:
             raise ImportError("Please install accelerate via `pip install accelerate`")
 
         device = torch.device("cuda")
-
-        self.unet.half().to(device)
-        attach_execution_device_hook(self.unet, device)
-        self.unet.forward = torch.autocast("cuda")(self.unet.forward)
         self.enable_attention_slicing(1)
 
-        for cpu_offloaded_model in [self.text_encoder, self.vae, self.safety_checker]:
-            cpu_offloaded_model.to(torch.float32)
-            attach_execution_device_hook(cpu_offloaded_model, "cpu")
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
+            cpu_offload(cpu_offloaded_model, device)
 
     @torch.no_grad()
     def __call__(
@@ -310,7 +305,7 @@ def __call__(
                     self.device
                 )
             else:
-                latents = torch.randn(latents_shape, generator=generator, device=self.unet.device, dtype=latents_dtype)
+                latents = torch.randn(latents_shape, generator=generator, device=self.device, dtype=latents_dtype)
         else:
             if latents.shape != latents_shape:
                 raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
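
Taken together, the new code path is meant to be used roughly as follows. A hedged sketch (the checkpoint id and prompt are placeholders, not part of the commit):

    import torch
    from diffusers import StableDiffusionPipeline

    # Placeholder checkpoint id; substitute any Stable Diffusion weights.
    pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4")

    # Offloads unet, text_encoder, vae and safety_checker through
    # accelerate's cpu_offload and enables attention slicing (slice size 1).
    pipe.cuda_with_minimal_gpu_usage()

    image = pipe("a photo of an astronaut riding a horse").images[0]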

tests/test_pipelines.py

Lines changed: 2 additions & 2 deletions
@@ -2293,5 +2293,5 @@ def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
         _ = pipeline(prompt)
 
         mem_bytes = torch.cuda.max_memory_allocated()
-        # make sure that less than 2.2 GB is allocated
-        assert mem_bytes < 2.2 * 10**9
+        # make sure that less than 0.8 GB is allocated
+        assert mem_bytes < 0.8 * 10**9
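
The tightened bound follows the usual pattern for asserting on peak CUDA memory. A sketch of that pattern (the counter reset is an assumption about the elided test setup, not shown in this hunk):

    import torch

    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

    # ... run the offloaded pipeline here ...

    mem_bytes = torch.cuda.max_memory_allocated()
    assert mem_bytes < 0.8 * 10**9  # i.e. under 0.8 GB at peak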
