Skip to content

Commit 3be9fa9

Browse files
[Accelerate model loading] Fix meta device and super low memory usage (#1016)
* [Accelerate model loading] Fix meta device and super low memory usage
* better naming
1 parent e92a603 commit 3be9fa9

File tree

3 files changed

+39
-75
lines changed

3 files changed

+39
-75
lines changed

src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -119,14 +119,13 @@ def disable_attention_slicing(self):
119119
# set slice_size = `None` to disable `attention slicing`
120120
self.enable_attention_slicing(None)
121121

122-
def cuda_with_minimal_gpu_usage(self):
122+
def enable_sequential_cpu_offload(self):
123123
if is_accelerate_available():
124124
from accelerate import cpu_offload
125125
else:
126126
raise ImportError("Please install accelerate via `pip install accelerate`")
127127

128128
device = torch.device("cuda")
129-
self.enable_attention_slicing(1)
130129

131130
for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae, self.safety_checker]:
132131
cpu_offload(cpu_offloaded_model, device)

tests/pipelines/stable_diffusion/test_stable_diffusion.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515

1616
import gc
1717
import random
18+
import time
1819
import unittest
1920

2021
import numpy as np
@@ -730,3 +731,39 @@ def test_callback_fn(step: int, timestep: int, latents: torch.FloatTensor) -> No
730731
)
731732
assert test_callback_fn.has_been_called
732733
assert number_of_steps == 51
734+
735+
def test_stable_diffusion_accelerate_auto_device(self):
736+
pipeline_id = "CompVis/stable-diffusion-v1-4"
737+
738+
start_time = time.time()
739+
pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
740+
pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True
741+
)
742+
pipeline_normal_load.to(torch_device)
743+
normal_load_time = time.time() - start_time
744+
745+
start_time = time.time()
746+
_ = StableDiffusionPipeline.from_pretrained(
747+
pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
748+
)
749+
meta_device_load_time = time.time() - start_time
750+
751+
assert 2 * meta_device_load_time < normal_load_time
752+
753+
@unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
754+
def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
755+
torch.cuda.empty_cache()
756+
torch.cuda.reset_max_memory_allocated()
757+
758+
pipeline_id = "CompVis/stable-diffusion-v1-4"
759+
prompt = "Andromeda galaxy in a bottle"
760+
761+
pipeline = StableDiffusionPipeline.from_pretrained(pipeline_id, revision="fp16", torch_dtype=torch.float16)
762+
pipeline.enable_attention_slicing(1)
763+
pipeline.enable_sequential_cpu_offload()
764+
765+
_ = pipeline(prompt, num_inference_steps=5)
766+
767+
mem_bytes = torch.cuda.max_memory_allocated()
768+
# make sure that less than 1.5 GB is allocated
769+
assert mem_bytes < 1.5 * 10**9

tests/test_pipelines.py

Lines changed: 1 addition & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,12 @@
1717
import os
1818
import random
1919
import tempfile
20-
import tracemalloc
2120
import unittest
2221

2322
import numpy as np
2423
import torch
2524

26-
import accelerate
2725
import PIL
28-
import transformers
2926
from diffusers import (
3027
AutoencoderKL,
3128
DDIMPipeline,
@@ -44,8 +41,7 @@
4441
from diffusers.pipeline_utils import DiffusionPipeline
4542
from diffusers.schedulers.scheduling_utils import SCHEDULER_CONFIG_NAME
4643
from diffusers.utils import CONFIG_NAME, WEIGHTS_NAME, floats_tensor, slow, torch_device
47-
from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir, require_torch_gpu
48-
from packaging import version
44+
from diffusers.utils.testing_utils import CaptureLogger, get_tests_dir
4945
from PIL import Image
5046
from transformers import CLIPFeatureExtractor, CLIPModel, CLIPTextConfig, CLIPTextModel, CLIPTokenizer
5147

@@ -487,71 +483,3 @@ def test_ddpm_ddim_equality_batched(self):
487483

488484
# the values aren't exactly equal, but the images look the same visually
489485
assert np.abs(ddpm_images - ddim_images).max() < 1e-1
490-
491-
@require_torch_gpu
492-
def test_stable_diffusion_accelerate_load_works(self):
493-
if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
494-
return
495-
496-
if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"):
497-
return
498-
499-
model_id = "CompVis/stable-diffusion-v1-4"
500-
_ = StableDiffusionPipeline.from_pretrained(
501-
model_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
502-
).to(torch_device)
503-
504-
@require_torch_gpu
505-
def test_stable_diffusion_accelerate_load_reduces_memory_footprint(self):
506-
if version.parse(version.parse(transformers.__version__).base_version) < version.parse("4.23"):
507-
return
508-
509-
if version.parse(version.parse(accelerate.__version__).base_version) < version.parse("0.14"):
510-
return
511-
512-
pipeline_id = "CompVis/stable-diffusion-v1-4"
513-
514-
torch.cuda.empty_cache()
515-
gc.collect()
516-
517-
tracemalloc.start()
518-
pipeline_normal_load = StableDiffusionPipeline.from_pretrained(
519-
pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True
520-
)
521-
pipeline_normal_load.to(torch_device)
522-
_, peak_normal = tracemalloc.get_traced_memory()
523-
tracemalloc.stop()
524-
525-
del pipeline_normal_load
526-
torch.cuda.empty_cache()
527-
gc.collect()
528-
529-
tracemalloc.start()
530-
_ = StableDiffusionPipeline.from_pretrained(
531-
pipeline_id, revision="fp16", torch_dtype=torch.float16, use_auth_token=True, device_map="auto"
532-
)
533-
_, peak_accelerate = tracemalloc.get_traced_memory()
534-
535-
tracemalloc.stop()
536-
537-
assert peak_accelerate < peak_normal
538-
539-
@slow
540-
@unittest.skipIf(torch_device == "cpu", "This test is supposed to run on GPU")
541-
def test_stable_diffusion_pipeline_with_unet_on_gpu_only(self):
542-
torch.cuda.empty_cache()
543-
torch.cuda.reset_max_memory_allocated()
544-
545-
pipeline_id = "CompVis/stable-diffusion-v1-4"
546-
prompt = "Andromeda galaxy in a bottle"
547-
548-
pipeline = StableDiffusionPipeline.from_pretrained(
549-
pipeline_id, revision="fp16", torch_dtype=torch.float32, use_auth_token=True
550-
)
551-
pipeline.cuda_with_minimal_gpu_usage()
552-
553-
_ = pipeline(prompt)
554-
555-
mem_bytes = torch.cuda.max_memory_allocated()
556-
# make sure that less than 0.8 GB is allocated
557-
assert mem_bytes < 0.8 * 10**9

0 commit comments

Comments (0)