From daa06601d330244f0170da8c4d9e6db6122bee56 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 12:36:46 +0000 Subject: [PATCH 01/12] up --- scripts/change_configs.py | 71 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) create mode 100644 scripts/change_configs.py diff --git a/scripts/change_configs.py b/scripts/change_configs.py new file mode 100644 index 000000000000..7e213bb85984 --- /dev/null +++ b/scripts/change_configs.py @@ -0,0 +1,71 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. """ + +import argparse +import json +import torch +from diffusers import UNetUnconditionalModel + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + config_parameters_to_change = { + + } + + parser.add_argument( + "--config_file", + default=None, + type=str, + required=True, + help="The config json file corresponding to the architecture.", + ) + + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + with open(args.config_file) as f: + config = json.loads(f.read()) + + + for key, value in config_parameters_to_change.items(): + if key in config: + if isinstance(value, dict): + new_list = [] + + for block_name in config[key]: + # map old block name to new one + new_list.append(value[block_name]) + else: + config[key] = value + + + state_dict = torch.load( + model = UNetUnconditionalModel(**config) + model.load_state_dict(converted_checkpoint) + + try: + scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) + vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + + pipe = LatentDiffusionUncondPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe.save_pretrained(args.dump_path) + except: + model.save_pretrained(args.dump_path) From 5b0da16a99ddfd786970b30ad9f534ffceb3e0b5 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 15:12:11 +0000 Subject: [PATCH 02/12] change model name --- .../convert_ncsnpp_original_checkpoint_to_diffusers.py | 2 +- scripts/generate_logits.py | 8 +++++--- src/diffusers/modeling_utils.py | 2 +- src/diffusers/pipeline_utils.py | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py index a50b780e51e1..ee10eb3c2557 100644 --- a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py @@ -134,7 +134,7 @@ def set_resnet_weights(new_layer, old_checkpoint, index): parser.add_argument( "--checkpoint_path", - default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_model.pt", + default="/Users/arthurzucker/Work/diffusers/ArthurZ/diffusion_pytorch_model.bin", type=str, required=False, help="Path to the checkpoint to convert.", diff --git 
a/scripts/generate_logits.py b/scripts/generate_logits.py index 352999f16e4c..0582322e785d 100644 --- a/scripts/generate_logits.py +++ b/scripts/generate_logits.py @@ -71,10 +71,12 @@ for mod in models: if "google" in mod.author or mod.modelId == "CompVis/ldm-celebahq-256": - if mod.modelId == "CompVis/ldm-celebahq-256" or not has_file(mod.modelId, "config.json"): - model = UNetUnconditionalModel.from_pretrained(mod.modelId, subfolder = "unet") + local_checkpoint = "/home/patrick/google_checkpoints/" + mod.modelId.split("/")[-1] + + if mod.modelId.startswith("CompVis"): + model = UNetUnconditionalModel.from_pretrained(local_checkpoint, subfolder = "unet") else: - model = UNetUnconditionalModel.from_pretrained(mod.modelId) + model = UNetUnconditionalModel.from_pretrained(local_checkpoint) torch.manual_seed(0) random.seed(0) diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 4d4bbbdd7b3d..1380f16d1df0 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -34,7 +34,7 @@ ) -WEIGHTS_NAME = "diffusion_model.pt" +WEIGHTS_NAME = "diffusion_pytorch_model.bin" logger = logging.get_logger(__name__) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index b0ff25c33955..10ae3003bfd5 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -24,7 +24,7 @@ from .utils import DIFFUSERS_CACHE, logging -INDEX_FILE = "diffusion_model.pt" +INDEX_FILE = "diffusion_pytorch_model.bin" logger = logging.get_logger(__name__) From 8358101c5d300c6a231437bdba4f89740110eaf0 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 15:28:24 +0000 Subject: [PATCH 03/12] renaming --- README.md | 4 +-- scripts/change_configs.py | 4 +-- ...t_ddpm_original_checkpoint_to_diffusers.py | 4 +-- ...rt_ldm_original_checkpoint_to_diffusers.py | 4 +-- ...ncsnpp_original_checkpoint_to_diffusers.py | 6 ++-- scripts/generate_logits.py | 6 ++-- src/diffusers/__init__.py | 2 +- src/diffusers/models/__init__.py | 4 +-- .../{unet_unconditional.py => unet_2d.py} | 2 +- ...et_conditional.py => unet_2d_condition.py} | 2 +- tests/test_modeling_utils.py | 34 +++++++++---------- 11 files changed, 36 insertions(+), 36 deletions(-) rename src/diffusers/models/{unet_unconditional.py => unet_2d.py} (99%) rename src/diffusers/models/{unet_conditional.py => unet_2d_condition.py} (99%) diff --git a/README.md b/README.md index 6f8d8a6eea2c..86000dd0188e 100644 --- a/README.md +++ b/README.md @@ -84,7 +84,7 @@ For more examples see [schedulers](https://github.com/huggingface/diffusers/tree ```python import torch -from diffusers import UNetUnconditionalModel, DDIMScheduler +from diffusers import UNet2DModel, DDIMScheduler import PIL.Image import numpy as np import tqdm @@ -93,7 +93,7 @@ torch_device = "cuda" if torch.cuda.is_available() else "cpu" # 1. Load models scheduler = DDIMScheduler.from_config("fusing/ddpm-celeba-hq", tensor_format="pt") -unet = UNetUnconditionalModel.from_pretrained("fusing/ddpm-celeba-hq", ddpm=True).to(torch_device) +unet = UNet2DModel.from_pretrained("fusing/ddpm-celeba-hq", ddpm=True).to(torch_device) # 2. 
Sample gaussian noise generator = torch.manual_seed(23) diff --git a/scripts/change_configs.py b/scripts/change_configs.py index 7e213bb85984..49b1cef67fc1 100644 --- a/scripts/change_configs.py +++ b/scripts/change_configs.py @@ -17,7 +17,7 @@ import argparse import json import torch -from diffusers import UNetUnconditionalModel +from diffusers import UNet2DModel if __name__ == "__main__": @@ -58,7 +58,7 @@ state_dict = torch.load( - model = UNetUnconditionalModel(**config) + model = UNet2DModel(**config) model.load_state_dict(converted_checkpoint) try: diff --git a/scripts/convert_ddpm_original_checkpoint_to_diffusers.py b/scripts/convert_ddpm_original_checkpoint_to_diffusers.py index b1499e285de8..92b64c38ba52 100644 --- a/scripts/convert_ddpm_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ddpm_original_checkpoint_to_diffusers.py @@ -1,4 +1,4 @@ -from diffusers import UNetUnconditionalModel, DDPMScheduler, DDPMPipeline +from diffusers import UNet2DModel, DDPMScheduler, DDPMPipeline import argparse import json import torch @@ -225,7 +225,7 @@ def convert_ddpm_checkpoint(checkpoint, config): if "ddpm" in config: del config["ddpm"] - model = UNetUnconditionalModel(**config) + model = UNet2DModel(**config) model.load_state_dict(converted_checkpoint) scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) diff --git a/scripts/convert_ldm_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_original_checkpoint_to_diffusers.py index 2ec816f08c34..30dfa8310cf6 100644 --- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py @@ -17,7 +17,7 @@ import argparse import json import torch -from diffusers import VQModel, DDPMScheduler, UNetUnconditionalModel, LatentDiffusionUncondPipeline +from diffusers import VQModel, DDPMScheduler, UNet2DModel, LatentDiffusionUncondPipeline def shave_segments(path, n_shave_prefix_segments=1): @@ -319,7 +319,7 @@ def convert_ldm_checkpoint(checkpoint, config): if "ldm" in config: del config["ldm"] - model = UNetUnconditionalModel(**config) + model = UNet2DModel(**config) model.load_state_dict(converted_checkpoint) try: diff --git a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py index ee10eb3c2557..ae179d5f9cbf 100644 --- a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py @@ -17,14 +17,14 @@ import argparse import json import torch -from diffusers import UNetUnconditionalModel +from diffusers import UNet2DModel def convert_ncsnpp_checkpoint(checkpoint, config): """ Takes a state dict and the path to """ - new_model_architecture = UNetUnconditionalModel(**config) + new_model_architecture = UNet2DModel(**config) new_model_architecture.time_steps.W.data = checkpoint["all_modules.0.W"].data new_model_architecture.time_steps.weight.data = checkpoint["all_modules.0.W"].data new_model_architecture.time_embedding.linear_1.weight.data = checkpoint["all_modules.1.weight"].data @@ -171,7 +171,7 @@ def set_resnet_weights(new_layer, old_checkpoint, index): if "sde" in config: del config["sde"] - model = UNetUnconditionalModel(**config) + model = UNet2DModel(**config) model.load_state_dict(converted_checkpoint) try: diff --git a/scripts/generate_logits.py b/scripts/generate_logits.py index 0582322e785d..93a94b7704c0 100644 --- a/scripts/generate_logits.py +++ b/scripts/generate_logits.py @@ -1,6 +1,6 @@ from huggingface_hub 
import HfApi from transformers.file_utils import has_file -from diffusers import UNetUnconditionalModel +from diffusers import UNet2DModel import random import torch api = HfApi() @@ -74,9 +74,9 @@ local_checkpoint = "/home/patrick/google_checkpoints/" + mod.modelId.split("/")[-1] if mod.modelId.startswith("CompVis"): - model = UNetUnconditionalModel.from_pretrained(local_checkpoint, subfolder = "unet") + model = UNet2DModel.from_pretrained(local_checkpoint, subfolder = "unet") else: - model = UNetUnconditionalModel.from_pretrained(local_checkpoint) + model = UNet2DModel.from_pretrained(local_checkpoint) torch.manual_seed(0) random.seed(0) diff --git a/src/diffusers/__init__.py b/src/diffusers/__init__.py index 6f50467752a3..e147a9161804 100644 --- a/src/diffusers/__init__.py +++ b/src/diffusers/__init__.py @@ -7,7 +7,7 @@ __version__ = "0.0.4" from .modeling_utils import ModelMixin -from .models import AutoencoderKL, UNetConditionalModel, UNetUnconditionalModel, VQModel +from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel from .pipeline_utils import DiffusionPipeline from .pipelines import DDIMPipeline, DDPMPipeline, LatentDiffusionUncondPipeline, PNDMPipeline, ScoreSdeVePipeline from .schedulers import DDIMScheduler, DDPMScheduler, PNDMScheduler, SchedulerMixin, ScoreSdeVeScheduler diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index f3b2fe9e821a..dbafcdf26ca8 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -16,6 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .unet_conditional import UNetConditionalModel -from .unet_unconditional import UNetUnconditionalModel +from .unet_2d_condition import UNet2DConditionModel +from .unet_2d import UNet2DModel from .vae import AutoencoderKL, VQModel diff --git a/src/diffusers/models/unet_unconditional.py b/src/diffusers/models/unet_2d.py similarity index 99% rename from src/diffusers/models/unet_unconditional.py rename to src/diffusers/models/unet_2d.py index c809374a6ffb..72de88e046df 100644 --- a/src/diffusers/models/unet_unconditional.py +++ b/src/diffusers/models/unet_2d.py @@ -9,7 +9,7 @@ from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block -class UNetUnconditionalModel(ModelMixin, ConfigMixin): +class UNet2DModel(ModelMixin, ConfigMixin): """ The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param diff --git a/src/diffusers/models/unet_conditional.py b/src/diffusers/models/unet_2d_condition.py similarity index 99% rename from src/diffusers/models/unet_conditional.py rename to src/diffusers/models/unet_2d_condition.py index 293542f587f0..f8bd3a120c20 100644 --- a/src/diffusers/models/unet_conditional.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -9,7 +9,7 @@ from .unet_blocks import UNetMidBlock2DCrossAttn, get_down_block, get_up_block -class UNetConditionalModel(ModelMixin, ConfigMixin): +class UNet2DConditionModel(ModelMixin, ConfigMixin): """ The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. 
:param diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index dc7f125476fe..5c745896a4ec 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -23,7 +23,7 @@ import numpy as np import torch -from diffusers import UNetConditionalModel # noqa: F401 TODO(Patrick) - need to write tests with it +from diffusers import UNet2DConditionModel # noqa: F401 TODO(Patrick) - need to write tests with it from diffusers import ( AutoencoderKL, DDIMPipeline, @@ -36,7 +36,7 @@ PNDMScheduler, ScoreSdeVePipeline, ScoreSdeVeScheduler, - UNetUnconditionalModel, + UNet2DModel, VQModel, ) from diffusers.configuration_utils import ConfigMixin, register_to_config @@ -271,7 +271,7 @@ def test_ema_training(self): class UnetModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNetUnconditionalModel + model_class = UNet2DModel @property def dummy_input(self): @@ -309,7 +309,7 @@ def prepare_init_args_and_inputs_for_common(self): # TODO(Patrick) - Re-add this test after having correctly added the final VE checkpoints # def test_output_pretrained(self): -# model = UNetUnconditionalModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet") +# model = UNet2DModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet") # model.eval() # # torch.manual_seed(0) @@ -330,7 +330,7 @@ def prepare_init_args_and_inputs_for_common(self): class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNetUnconditionalModel + model_class = UNet2DModel @property def dummy_input(self): @@ -367,7 +367,7 @@ def prepare_init_args_and_inputs_for_common(self): return init_dict, inputs_dict def test_from_pretrained_hub(self): - model, loading_info = UNetUnconditionalModel.from_pretrained( + model, loading_info = UNet2DModel.from_pretrained( "fusing/unet-ldm-dummy-update", output_loading_info=True ) self.assertIsNotNone(model) @@ -379,7 +379,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = UNetUnconditionalModel.from_pretrained("fusing/unet-ldm-dummy-update") + model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") model.eval() torch.manual_seed(0) @@ -426,7 +426,7 @@ def test_output_pretrained(self): class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNetUnconditionalModel + model_class = UNet2DModel @property def dummy_input(self): @@ -474,7 +474,7 @@ def prepare_init_args_and_inputs_for_common(self): return init_dict, inputs_dict def test_from_pretrained_hub(self): - model, loading_info = UNetUnconditionalModel.from_pretrained( + model, loading_info = UNet2DModel.from_pretrained( "fusing/ncsnpp-ffhq-ve-dummy-update", output_loading_info=True ) self.assertIsNotNone(model) @@ -486,7 +486,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_output_pretrained_ve_mid(self): - model = UNetUnconditionalModel.from_pretrained("google/ncsnpp-celebahq-256") + model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256") model.to(torch_device) torch.manual_seed(0) @@ -511,7 +511,7 @@ def test_output_pretrained_ve_mid(self): self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) def test_output_pretrained_ve_large(self): - model = UNetUnconditionalModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") + model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") model.to(torch_device) torch.manual_seed(0) @@ -685,7 +685,7 @@ def 
test_output_pretrained(self): class PipelineTesterMixin(unittest.TestCase): def test_from_pretrained_save_pretrained(self): # 1. Load models - model = UNetUnconditionalModel( + model = UNet2DModel( block_channels=(32, 64), num_res_blocks=2, image_size=32, @@ -732,7 +732,7 @@ def test_from_pretrained_hub(self): def test_ddpm_cifar10(self): model_id = "google/ddpm-cifar10-32" - unet = UNetUnconditionalModel.from_pretrained(model_id) + unet = UNet2DModel.from_pretrained(model_id) scheduler = DDPMScheduler.from_config(model_id) scheduler = scheduler.set_format("pt") @@ -753,7 +753,7 @@ def test_ddpm_cifar10(self): def test_ddim_lsun(self): model_id = "google/ddpm-ema-bedroom-256" - unet = UNetUnconditionalModel.from_pretrained(model_id) + unet = UNet2DModel.from_pretrained(model_id) scheduler = DDIMScheduler.from_config(model_id) ddpm = DDIMPipeline(unet=unet, scheduler=scheduler) @@ -773,7 +773,7 @@ def test_ddim_lsun(self): def test_ddim_cifar10(self): model_id = "google/ddpm-cifar10-32" - unet = UNetUnconditionalModel.from_pretrained(model_id) + unet = UNet2DModel.from_pretrained(model_id) scheduler = DDIMScheduler(tensor_format="pt") ddim = DDIMPipeline(unet=unet, scheduler=scheduler) @@ -793,7 +793,7 @@ def test_ddim_cifar10(self): def test_pndm_cifar10(self): model_id = "google/ddpm-cifar10-32" - unet = UNetUnconditionalModel.from_pretrained(model_id) + unet = UNet2DModel.from_pretrained(model_id) scheduler = PNDMScheduler(tensor_format="pt") pndm = PNDMPipeline(unet=unet, scheduler=scheduler) @@ -838,7 +838,7 @@ def test_ldm_text2img_fast(self): @slow def test_score_sde_ve_pipeline(self): - model = UNetUnconditionalModel.from_pretrained("google/ncsnpp-ffhq-1024") + model = UNet2DModel.from_pretrained("google/ncsnpp-ffhq-1024") torch.manual_seed(0) if torch.cuda.is_available(): From 748ede7e79c5a4aa0d7f96a0b84a0fc9af091242 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 15:42:57 +0000 Subject: [PATCH 04/12] more changes --- tests/test_modeling_utils.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index c42d024b05c2..f74d4af5c3a9 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -712,7 +712,7 @@ def test_from_pretrained_save_pretrained(self): @slow def test_from_pretrained_hub(self): - model_path = "google/ddpm-cifar10-32" + model_path = "/home/patrick/google_checkpoints/ddpm-cifar10-32" ddpm = DDPMPipeline.from_pretrained(model_path) ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) @@ -730,7 +730,7 @@ def test_from_pretrained_hub(self): @slow def test_ddpm_cifar10(self): - model_id = "google/ddpm-cifar10-32" + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" unet = UNet2DModel.from_pretrained(model_id) scheduler = DDPMScheduler.from_config(model_id) @@ -749,7 +749,7 @@ def test_ddpm_cifar10(self): @slow def test_ddim_lsun(self): - model_id = "google/ddpm-ema-bedroom-256" + model_id = "/home/patrick/google_checkpoints/ddpm-ema-bedroom-256" unet = UNet2DModel.from_pretrained(model_id) scheduler = DDIMScheduler.from_config(model_id) @@ -767,7 +767,7 @@ def test_ddim_lsun(self): @slow def test_ddim_cifar10(self): - model_id = "google/ddpm-cifar10-32" + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" unet = UNet2DModel.from_pretrained(model_id) scheduler = DDIMScheduler(tensor_format="pt") @@ -785,7 +785,7 @@ def test_ddim_cifar10(self): @slow def test_pndm_cifar10(self): - model_id = 
"google/ddpm-cifar10-32" + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" unet = UNet2DModel.from_pretrained(model_id) scheduler = PNDMScheduler(tensor_format="pt") @@ -802,7 +802,7 @@ def test_pndm_cifar10(self): @slow def test_ldm_text2img(self): - ldm = LatentDiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") + ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") prompt = "A painting of a squirrel eating a burger" generator = torch.manual_seed(0) @@ -816,7 +816,7 @@ def test_ldm_text2img(self): @slow def test_ldm_text2img_fast(self): - ldm = LatentDiffusionPipeline.from_pretrained("CompVis/ldm-text2im-large-256") + ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") prompt = "A painting of a squirrel eating a burger" generator = torch.manual_seed(0) @@ -830,13 +830,13 @@ def test_ldm_text2img_fast(self): @slow def test_score_sde_ve_pipeline(self): - model = UNet2DModel.from_pretrained("google/ncsnpp-church-256") + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-church-256") torch.manual_seed(0) if torch.cuda.is_available(): torch.cuda.manual_seed_all(0) - scheduler = ScoreSdeVeScheduler.from_config("google/ncsnpp-church-256") + scheduler = ScoreSdeVeScheduler.from_config("/home/patrick/google_checkpoints/ncsnpp-church-256") sde_ve = ScoreSdeVePipeline(model=model, scheduler=scheduler) @@ -851,7 +851,7 @@ def test_score_sde_ve_pipeline(self): @slow def test_ldm_uncond(self): - ldm = LatentDiffusionUncondPipeline.from_pretrained("CompVis/ldm-celebahq-256") + ldm = LatentDiffusionUncondPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-celebahq-256") generator = torch.manual_seed(0) image = ldm(generator=generator, num_inference_steps=5)["sample"] From fc90b452603336cd2b2992e7cc32e22773024f7f Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 16:32:43 +0000 Subject: [PATCH 05/12] up --- @ | 861 ++++++++++++++++++ src/diffusers/models/__init__.py | 2 +- src/diffusers/schedulers/scheduling_sde_ve.py | 33 +- tests/test_modeling_utils.py | 34 +- 4 files changed, 896 insertions(+), 34 deletions(-) create mode 100644 @ diff --git a/@ b/@ new file mode 100644 index 000000000000..4f932be5b4ca --- /dev/null +++ b/@ @@ -0,0 +1,861 @@ +# coding=utf-8 +# Copyright 2022 HuggingFace Inc. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import inspect +import math +import tempfile +import unittest +from atexit import register + +import numpy as np +import torch + +from diffusers import UNet2DConditionModel # noqa: F401 TODO(Patrick) - need to write tests with it +from diffusers import ( + AutoencoderKL, + DDIMPipeline, + DDIMScheduler, + DDPMPipeline, + DDPMScheduler, + LatentDiffusionPipeline, + LatentDiffusionUncondPipeline, + PNDMPipeline, + PNDMScheduler, + ScoreSdeVePipeline, + ScoreSdeVeScheduler, + UNet2DModel, + VQModel, +) +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.pipeline_utils import DiffusionPipeline +from diffusers.testing_utils import floats_tensor, slow, torch_device +from diffusers.training_utils import EMAModel + + +torch.backends.cuda.matmul.allow_tf32 = False + + +class SampleObject(ConfigMixin): + config_name = "config.json" + + @register_to_config + def __init__( + self, + a=2, + b=5, + c=(2, 5), + d="for diffusion", + e=[1, 3], + ): + pass + + +class ConfigTester(unittest.TestCase): + def test_load_not_from_mixin(self): + with self.assertRaises(ValueError): + ConfigMixin.from_config("dummy_path") + + def test_register_to_config(self): + obj = SampleObject() + config = obj.config + assert config["a"] == 2 + assert config["b"] == 5 + assert config["c"] == (2, 5) + assert config["d"] == "for diffusion" + assert config["e"] == [1, 3] + + # init ignore private arguments + obj = SampleObject(_name_or_path="lalala") + config = obj.config + assert config["a"] == 2 + assert config["b"] == 5 + assert config["c"] == (2, 5) + assert config["d"] == "for diffusion" + assert config["e"] == [1, 3] + + # can override default + obj = SampleObject(c=6) + config = obj.config + assert config["a"] == 2 + assert config["b"] == 5 + assert config["c"] == 6 + assert config["d"] == "for diffusion" + assert config["e"] == [1, 3] + + # can use positional arguments. 
+ obj = SampleObject(1, c=6) + config = obj.config + assert config["a"] == 1 + assert config["b"] == 5 + assert config["c"] == 6 + assert config["d"] == "for diffusion" + assert config["e"] == [1, 3] + + def test_save_load(self): + obj = SampleObject() + config = obj.config + + assert config["a"] == 2 + assert config["b"] == 5 + assert config["c"] == (2, 5) + assert config["d"] == "for diffusion" + assert config["e"] == [1, 3] + + with tempfile.TemporaryDirectory() as tmpdirname: + obj.save_config(tmpdirname) + new_obj = SampleObject.from_config(tmpdirname) + new_config = new_obj.config + + # unfreeze configs + config = dict(config) + new_config = dict(new_config) + + assert config.pop("c") == (2, 5) # instantiated as tuple + assert new_config.pop("c") == [2, 5] # saved & loaded as list because of json + assert config == new_config + + +class ModelTesterMixin: + def test_from_pretrained_save_pretrained(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_pretrained(tmpdirname) + new_model = self.model_class.from_pretrained(tmpdirname) + new_model.to(torch_device) + + with torch.no_grad(): + image = model(**inputs_dict) + if isinstance(image, dict): + image = image["sample"] + + new_image = new_model(**inputs_dict) + + if isinstance(new_image, dict): + new_image = new_image["sample"] + + max_diff = (image - new_image).abs().sum().item() + self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") + + def test_determinism(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + with torch.no_grad(): + first = model(**inputs_dict) + if isinstance(first, dict): + first = first["sample"] + + second = model(**inputs_dict) + if isinstance(second, dict): + second = second["sample"] + + out_1 = first.cpu().numpy() + out_2 = second.cpu().numpy() + out_1 = out_1[~np.isnan(out_1)] + out_2 = out_2[~np.isnan(out_2)] + max_diff = np.amax(np.abs(out_1 - out_2)) + self.assertLessEqual(max_diff, 1e-5) + + def test_output(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + with torch.no_grad(): + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output["sample"] + + self.assertIsNotNone(output) + expected_shape = inputs_dict["sample"].shape + self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") + + def test_forward_signature(self): + init_dict, _ = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + signature = inspect.signature(model.forward) + # signature.parameters is an OrderedDict => so arg_names order is deterministic + arg_names = [*signature.parameters.keys()] + + expected_arg_names = ["sample", "timestep"] + self.assertListEqual(arg_names[:2], expected_arg_names) + + def test_model_from_config(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.eval() + + # test if the model can be loaded from the config + # and has all the expected shape + with tempfile.TemporaryDirectory() as tmpdirname: + model.save_config(tmpdirname) + new_model = self.model_class.from_config(tmpdirname) + 
new_model.to(torch_device) + new_model.eval() + + # check if all paramters shape are the same + for param_name in model.state_dict().keys(): + param_1 = model.state_dict()[param_name] + param_2 = new_model.state_dict()[param_name] + self.assertEqual(param_1.shape, param_2.shape) + + with torch.no_grad(): + output_1 = model(**inputs_dict) + + if isinstance(output_1, dict): + output_1 = output_1["sample"] + + output_2 = new_model(**inputs_dict) + + if isinstance(output_2, dict): + output_2 = output_2["sample"] + + self.assertEqual(output_1.shape, output_2.shape) + + def test_training(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.train() + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output["sample"] + + noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) + loss = torch.nn.functional.mse_loss(output, noise) + loss.backward() + + def test_ema_training(self): + init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() + + model = self.model_class(**init_dict) + model.to(torch_device) + model.train() + ema_model = EMAModel(model, device=torch_device) + + output = model(**inputs_dict) + + if isinstance(output, dict): + output = output["sample"] + + noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) + loss = torch.nn.functional.mse_loss(output, noise) + loss.backward() + ema_model.step(model) + + +class UnetModelTests(ModelTesterMixin, unittest.TestCase): + model_class = UNet2DModel + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + time_step = torch.tensor([10]).to(torch_device) + + return {"sample": noise, "timestep": time_step} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "block_channels": (32, 64), + "down_blocks": ("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), + "up_blocks": ("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), + "num_head_channels": None, + "out_channels": 3, + "in_channels": 3, + "num_res_blocks": 2, + "image_size": 32, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + +# TODO(Patrick) - Re-add this test after having correctly added the final VE checkpoints +# def test_output_pretrained(self): +# model = UNet2DModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet") +# model.eval() +# +# torch.manual_seed(0) +# if torch.cuda.is_available(): +# torch.cuda.manual_seed_all(0) +# +# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) +# time_step = torch.tensor([10]) +# +# with torch.no_grad(): +# output = model(noise, time_step)["sample"] +# +# output_slice = output[0, -1, -3:, -3:].flatten() +# fmt: off +# expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053]) +# fmt: on +# self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) + + +class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): + model_class = UNet2DModel + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 4 + sizes = (32, 32) + + noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + time_step = 
torch.tensor([10]).to(torch_device) + + return {"sample": noise, "timestep": time_step} + + @property + def input_shape(self): + return (4, 32, 32) + + @property + def output_shape(self): + return (4, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "image_size": 32, + "in_channels": 4, + "out_channels": 4, + "num_res_blocks": 2, + "block_channels": (32, 64), + "num_head_channels": 32, + "conv_resample": True, + "down_blocks": ("UNetResDownBlock2D", "UNetResDownBlock2D"), + "up_blocks": ("UNetResUpBlock2D", "UNetResUpBlock2D"), + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_from_pretrained_hub(self): + model, loading_info = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/unet-ldm-dummy-update", output_loading_info=True) + + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = model(**self.dummy_input)["sample"] + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained(self): + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/unet-ldm-dummy-update") + model.eval() + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) + time_step = torch.tensor([10] * noise.shape[0]) + + with torch.no_grad(): + output = model(noise, time_step)["sample"] + + output_slice = output[0, -1, -3:, -3:].flatten() + # fmt: off + expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800]) + # fmt: on + + self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) + + +# TODO(Patrick) - Re-add this test after having cleaned up LDM +# def test_output_pretrained_spatial_transformer(self): +# model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial") +# model.eval() +# +# torch.manual_seed(0) +# if torch.cuda.is_available(): +# torch.cuda.manual_seed_all(0) +# +# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) +# context = torch.ones((1, 16, 64), dtype=torch.float32) +# time_step = torch.tensor([10] * noise.shape[0]) +# +# with torch.no_grad(): +# output = model(noise, time_step, context=context) +# +# output_slice = output[0, -1, -3:, -3:].flatten() +# fmt: off +# expected_output_slice = torch.tensor([61.3445, 56.9005, 29.4339, 59.5497, 60.7375, 34.1719, 48.1951, 42.6569, 25.0890]) +# fmt: on +# +# self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) +# + + +class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): + model_class = UNet2DModel + + @property + def dummy_input(self, sizes=(32, 32)): + batch_size = 4 + num_channels = 3 + + noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + time_step = torch.tensor(batch_size * [10]).to(torch_device) + + return {"sample": noise, "timestep": time_step} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "block_channels": [32, 64, 64, 64], + "in_channels": 3, + "num_res_blocks": 1, + "out_channels": 3, + "time_embedding_type": "fourier", + "resnet_eps": 1e-6, + "mid_block_scale_factor": math.sqrt(2.0), + "resnet_num_groups": None, + "down_blocks": [ + "UNetResSkipDownBlock2D", + 
"UNetResAttnSkipDownBlock2D", + "UNetResSkipDownBlock2D", + "UNetResSkipDownBlock2D", + ], + "up_blocks": [ + "UNetResSkipUpBlock2D", + "UNetResSkipUpBlock2D", + "UNetResAttnSkipUpBlock2D", + "UNetResSkipUpBlock2D", + ], + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_from_pretrained_hub(self): + model, loading_info = UNet2DModel.from_pretrained( + "/home/patrick/google_checkpoints/ncsnpp-celebahq-256", output_loading_info=True + + ) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = model(**self.dummy_input(sizes=(256, 256))) + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained_ve_mid(self): + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-celebahq-256") + model.to(torch_device) + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + batch_size = 4 + num_channels = 3 + sizes = (256, 256) + + noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) + time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) + + with torch.no_grad(): + output = model(noise, time_step)["sample"] + + output_slice = output[0, -3:, -3:, -1].flatten().cpu() + # fmt: off + expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114]) + # fmt: on + + self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) + + def test_output_pretrained_ve_large(self): + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-ffhq-ve-dummy-update") + model.to(torch_device) + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) + time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) + + with torch.no_grad(): + output = model(noise, time_step)["sample"] + + output_slice = output[0, -3:, -3:, -1].flatten().cpu() + # fmt: off + expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256]) + # fmt: on + + self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) + + +class VQModelTests(ModelTesterMixin, unittest.TestCase): + model_class = VQModel + + @property + def dummy_input(self, sizes=(32, 32)): + batch_size = 4 + num_channels = 3 + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "ch": 64, + "out_ch": 3, + "num_res_blocks": 1, + "in_channels": 3, + "attn_resolutions": [], + "resolution": 32, + "z_channels": 3, + "n_embed": 256, + "embed_dim": 3, + "sane_index_shape": False, + "ch_mult": (1,), + "dropout": 0.0, + "double_z": False, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_forward_signature(self): + pass + + def test_training(self): + pass + + def test_from_pretrained_hub(self): + model, loading_info = VQModel.from_pretrained("/home/patrick/google_checkpoints/vqgan-dummy", output_loading_info=True) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = 
model(**self.dummy_input) + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained(self): + model = VQModel.from_pretrained("/home/patrick/google_checkpoints/vqgan-dummy") + model.eval() + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + image = torch.randn(1, model.config.in_channels, model.config.resolution, model.config.resolution) + with torch.no_grad(): + output = model(image) + + output_slice = output[0, -1, -3:, -3:].flatten() + # fmt: off + expected_output_slice = torch.tensor([-1.1321, 0.1056, 0.3505, -0.6461, -0.2014, 0.0419, -0.5763, -0.8462, -0.4218]) + # fmt: on + self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) + + +class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): + model_class = AutoencoderKL + + @property + def dummy_input(self): + batch_size = 4 + num_channels = 3 + sizes = (32, 32) + + image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) + + return {"sample": image} + + @property + def input_shape(self): + return (3, 32, 32) + + @property + def output_shape(self): + return (3, 32, 32) + + def prepare_init_args_and_inputs_for_common(self): + init_dict = { + "ch": 64, + "ch_mult": (1,), + "embed_dim": 4, + "in_channels": 3, + "attn_resolutions": [], + "num_res_blocks": 1, + "out_ch": 3, + "resolution": 32, + "z_channels": 4, + } + inputs_dict = self.dummy_input + return init_dict, inputs_dict + + def test_forward_signature(self): + pass + + def test_training(self): + pass + + def test_from_pretrained_hub(self): + model, loading_info = AutoencoderKL.from_pretrained("/home/patrick/google_checkpoints/autoencoder-kl-dummy", output_loading_info=True) + self.assertIsNotNone(model) + self.assertEqual(len(loading_info["missing_keys"]), 0) + + model.to(torch_device) + image = model(**self.dummy_input) + + assert image is not None, "Make sure output is not None" + + def test_output_pretrained(self): + model = AutoencoderKL.from_pretrained("/home/patrick/google_checkpoints/autoencoder-kl-dummy") + model.eval() + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + image = torch.randn(1, model.config.in_channels, model.config.resolution, model.config.resolution) + with torch.no_grad(): + output = model(image, sample_posterior=True) + + output_slice = output[0, -1, -3:, -3:].flatten() + # fmt: off + expected_output_slice = torch.tensor([-0.0814, -0.0229, -0.1320, -0.4123, -0.0366, -0.3473, 0.0438, -0.1662, 0.1750]) + # fmt: on + self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) + + +class PipelineTesterMixin(unittest.TestCase): + def test_from_pretrained_save_pretrained(self): + # 1. 
Load models + model = UNet2DModel( + block_channels=(32, 64), + num_res_blocks=2, + image_size=32, + in_channels=3, + out_channels=3, + down_blocks=("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), + up_blocks=("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), + ) + schedular = DDPMScheduler(num_train_timesteps=10) + + ddpm = DDPMPipeline(model, schedular) + + with tempfile.TemporaryDirectory() as tmpdirname: + ddpm.save_pretrained(tmpdirname) + new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) + + generator = torch.manual_seed(0) + + image = ddpm(generator=generator)["sample"] + generator = generator.manual_seed(0) + new_image = new_ddpm(generator=generator)["sample"] + + assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" + + @slow + def test_from_pretrained_hub(self): + model_path = "/home/patrick/google_checkpoints/ddpm-cifar10-32" + + ddpm = DDPMPipeline.from_pretrained(model_path) + ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) + + ddpm.scheduler.num_timesteps = 10 + ddpm_from_hub.scheduler.num_timesteps = 10 + + generator = torch.manual_seed(0) + + image = ddpm(generator=generator)["sample"] + generator = generator.manual_seed(0) + new_image = ddpm_from_hub(generator=generator)["sample"] + + assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" + + @slow + def test_ddpm_cifar10(self): + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" + + unet = UNet2DModel.from_pretrained(model_id) + scheduler = DDPMScheduler.from_config(model_id) + scheduler = scheduler.set_format("pt") + + ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) + + generator = torch.manual_seed(0) + image = ddpm(generator=generator)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_ddim_lsun(self): + model_id = "/home/patrick/google_checkpoints/ddpm-ema-bedroom-256" + + unet = UNet2DModel.from_pretrained(model_id) + scheduler = DDIMScheduler.from_config(model_id) + + ddpm = DDIMPipeline(unet=unet, scheduler=scheduler) + + generator = torch.manual_seed(0) + image = ddpm(generator=generator)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 256, 256, 3) + expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_ddim_cifar10(self): + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" + + unet = UNet2DModel.from_pretrained(model_id) + scheduler = DDIMScheduler(tensor_format="pt") + + ddim = DDIMPipeline(unet=unet, scheduler=scheduler) + + generator = torch.manual_seed(0) + image = ddim(generator=generator, eta=0.0)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_pndm_cifar10(self): + model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" + + unet = UNet2DModel.from_pretrained(model_id) + scheduler = PNDMScheduler(tensor_format="pt") + + pndm = PNDMPipeline(unet=unet, scheduler=scheduler) + generator = torch.manual_seed(0) + image = pndm(generator=generator)["sample"] + + 
image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_ldm_text2img(self): + ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") + + prompt = "A painting of a squirrel eating a burger" + generator = torch.manual_seed(0) + image = ldm([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 256, 256, 3) + expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_ldm_text2img_fast(self): + ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") + + prompt = "A painting of a squirrel eating a burger" + generator = torch.manual_seed(0) + image = ldm([prompt], generator=generator, num_inference_steps=1)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 256, 256, 3) + expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_score_sde_ve_pipeline(self): + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-church-256") + + torch.manual_seed(0) + if torch.cuda.is_available(): + torch.cuda.manual_seed_all(0) + + scheduler = ScoreSdeVeScheduler.from_config("/home/patrick/google_checkpoints/ncsnpp-church-256") + + sde_ve = ScoreSdeVePipeline(model=model, scheduler=scheduler) + + torch.manual_seed(0) + image = sde_ve(num_inference_steps=300)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 256, 256, 3) + expected_slice = np.array([0.64363, 0.5868, 0.3031, 0.2284, 0.7409, 0.3216, 0.25643, 0.6557, 0.2633]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 + + @slow + def test_ldm_uncond(self): + ldm = LatentDiffusionUncondPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-celebahq-256") + + generator = torch.manual_seed(0) + image = ldm(generator=generator, num_inference_steps=5)["sample"] + + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 256, 256, 3) + expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447]) + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/src/diffusers/models/__init__.py b/src/diffusers/models/__init__.py index dbafcdf26ca8..0c19b49b148d 100644 --- a/src/diffusers/models/__init__.py +++ b/src/diffusers/models/__init__.py @@ -16,6 +16,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from .unet_2d_condition import UNet2DConditionModel from .unet_2d import UNet2DModel +from .unet_2d_condition import UNet2DConditionModel from .vae import AutoencoderKL, VQModel diff --git a/src/diffusers/schedulers/scheduling_sde_ve.py b/src/diffusers/schedulers/scheduling_sde_ve.py index 2f21faa2bf2a..92975a3ffd24 100644 --- a/src/diffusers/schedulers/scheduling_sde_ve.py +++ b/src/diffusers/schedulers/scheduling_sde_ve.py @@ -15,7 +15,6 @@ # DISCLAIMER: This file is strongly influenced by https://github.com/yang-song/score_sde_pytorch # TODO(Patrick, Anton, Suraj) - make scheduler framework indepedent and clean-up a bit -import pdb from typing import Union import numpy as np @@ -55,39 +54,35 @@ def __init__( # self.num_inference_steps = None self.timesteps = None - self.set_sigmas(self.num_train_timesteps) + self.set_sigmas(num_train_timesteps, sigma_min, sigma_max, sampling_eps) self.tensor_format = tensor_format self.set_format(tensor_format=tensor_format) - def set_timesteps(self, num_inference_steps): + def set_timesteps(self, num_inference_steps, sampling_eps=None): + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps tensor_format = getattr(self, "tensor_format", "pt") if tensor_format == "np": - self.timesteps = np.linspace(1, self.config.sampling_eps, num_inference_steps) + self.timesteps = np.linspace(1, sampling_eps, num_inference_steps) elif tensor_format == "pt": - self.timesteps = torch.linspace(1, self.config.sampling_eps, num_inference_steps) + self.timesteps = torch.linspace(1, sampling_eps, num_inference_steps) else: raise ValueError(f"`self.tensor_format`: {self.tensor_format} is not valid.") - def set_sigmas(self, num_inference_steps): + def set_sigmas(self, num_inference_steps, sigma_min=None, sigma_max=None, sampling_eps=None): + sigma_min = sigma_min if sigma_min is not None else self.config.sigma_min + sigma_max = sigma_max if sigma_max is not None else self.config.sigma_max + sampling_eps = sampling_eps if sampling_eps is not None else self.config.sampling_eps if self.timesteps is None: - self.set_timesteps(num_inference_steps) + self.set_timesteps(num_inference_steps, sampling_eps) tensor_format = getattr(self, "tensor_format", "pt") if tensor_format == "np": - self.discrete_sigmas = np.exp( - np.linspace(np.log(self.config.sigma_min), np.log(self.config.sigma_max), num_inference_steps) - ) - self.sigmas = np.array( - [self.config.sigma_min * (self.config.sigma_max / self.sigma_min) ** t for t in self.timesteps] - ) + self.discrete_sigmas = np.exp(np.linspace(np.log(sigma_min), np.log(sigma_max), num_inference_steps)) + self.sigmas = np.array([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) elif tensor_format == "pt": - self.discrete_sigmas = torch.exp( - torch.linspace(np.log(self.config.sigma_min), np.log(self.config.sigma_max), num_inference_steps) - ) - self.sigmas = torch.tensor( - [self.config.sigma_min * (self.config.sigma_max / self.sigma_min) ** t for t in self.timesteps] - ) + self.discrete_sigmas = torch.exp(torch.linspace(np.log(sigma_min), np.log(sigma_max), num_inference_steps)) + self.sigmas = torch.tensor([sigma_min * (sigma_max / sigma_min) ** t for t in self.timesteps]) else: raise ValueError(f"`self.tensor_format`: {self.tensor_format} is not valid.") diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 8c6a5a7bd402..c7b87e5add7a 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -368,8 +368,9 @@ def 
prepare_init_args_and_inputs_for_common(self): def test_from_pretrained_hub(self): model, loading_info = UNet2DModel.from_pretrained( - "fusing/unet-ldm-dummy-update", output_loading_info=True + "/home/patrick/google_checkpoints/unet-ldm-dummy-update", output_loading_info=True ) + self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) @@ -379,7 +380,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = UNet2DModel.from_pretrained("fusing/unet-ldm-dummy-update") + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/unet-ldm-dummy-update") model.eval() torch.manual_seed(0) @@ -429,10 +430,9 @@ class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): model_class = UNet2DModel @property - def dummy_input(self): + def dummy_input(self, sizes=(32, 32)): batch_size = 4 num_channels = 3 - sizes = (32, 32) noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) time_step = torch.tensor(batch_size * [10]).to(torch_device) @@ -475,18 +475,21 @@ def prepare_init_args_and_inputs_for_common(self): def test_from_pretrained_hub(self): model, loading_info = UNet2DModel.from_pretrained( - "fusing/ncsnpp-ffhq-ve-dummy-update", output_loading_info=True + "/home/patrick/google_checkpoints/ncsnpp-celebahq-256", output_loading_info=True ) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) model.to(torch_device) - image = model(**self.dummy_input) + inputs = self.dummy_input + noise = floats_tensor((4, 3) + (256, 256)).to(torch_device) + inputs["sample"] = noise + image = model(**inputs) assert image is not None, "Make sure output is not None" def test_output_pretrained_ve_mid(self): - model = UNet2DModel.from_pretrained("google/ncsnpp-celebahq-256") + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-celebahq-256") model.to(torch_device) torch.manual_seed(0) @@ -511,7 +514,7 @@ def test_output_pretrained_ve_mid(self): self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained("fusing/ncsnpp-ffhq-ve-dummy-update") + model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-ffhq-ve-dummy-update") model.to(torch_device) torch.manual_seed(0) @@ -540,10 +543,9 @@ class VQModelTests(ModelTesterMixin, unittest.TestCase): model_class = VQModel @property - def dummy_input(self): + def dummy_input(self, sizes=(32, 32)): batch_size = 4 num_channels = 3 - sizes = (32, 32) image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) @@ -583,7 +585,9 @@ def test_training(self): pass def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained("fusing/vqgan-dummy", output_loading_info=True) + model, loading_info = VQModel.from_pretrained( + "/home/patrick/google_checkpoints/vqgan-dummy", output_loading_info=True + ) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) @@ -593,7 +597,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = VQModel.from_pretrained("fusing/vqgan-dummy") + model = VQModel.from_pretrained("/home/patrick/google_checkpoints/vqgan-dummy") model.eval() torch.manual_seed(0) @@ -654,7 +658,9 @@ def test_training(self): pass def test_from_pretrained_hub(self): - model, loading_info = 
AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy", output_loading_info=True) + model, loading_info = AutoencoderKL.from_pretrained( + "/home/patrick/google_checkpoints/autoencoder-kl-dummy", output_loading_info=True + ) self.assertIsNotNone(model) self.assertEqual(len(loading_info["missing_keys"]), 0) @@ -664,7 +670,7 @@ def test_from_pretrained_hub(self): assert image is not None, "Make sure output is not None" def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("fusing/autoencoder-kl-dummy") + model = AutoencoderKL.from_pretrained("/home/patrick/google_checkpoints/autoencoder-kl-dummy") model.eval() torch.manual_seed(0) From d4c63161d20a9fa133d98951c0e179d32cfb6abd Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 17:28:23 +0000 Subject: [PATCH 06/12] up --- @ | 861 ------------------ _ | 82 ++ scripts/change_configs.py | 51 +- src/diffusers/configuration_utils.py | 48 +- src/diffusers/dynamic_modules_utils.py | 20 +- src/diffusers/modeling_utils.py | 58 +- src/diffusers/models/attention.py | 9 +- src/diffusers/models/unet_2d.py | 3 - src/diffusers/pipeline_utils.py | 8 +- src/diffusers/pipelines/ddim/pipeline_ddim.py | 4 +- tests/test_modeling_utils.py | 4 +- 11 files changed, 190 insertions(+), 958 deletions(-) delete mode 100644 @ create mode 100644 _ diff --git a/@ b/@ deleted file mode 100644 index 4f932be5b4ca..000000000000 --- a/@ +++ /dev/null @@ -1,861 +0,0 @@ -# coding=utf-8 -# Copyright 2022 HuggingFace Inc. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- - -import inspect -import math -import tempfile -import unittest -from atexit import register - -import numpy as np -import torch - -from diffusers import UNet2DConditionModel # noqa: F401 TODO(Patrick) - need to write tests with it -from diffusers import ( - AutoencoderKL, - DDIMPipeline, - DDIMScheduler, - DDPMPipeline, - DDPMScheduler, - LatentDiffusionPipeline, - LatentDiffusionUncondPipeline, - PNDMPipeline, - PNDMScheduler, - ScoreSdeVePipeline, - ScoreSdeVeScheduler, - UNet2DModel, - VQModel, -) -from diffusers.configuration_utils import ConfigMixin, register_to_config -from diffusers.pipeline_utils import DiffusionPipeline -from diffusers.testing_utils import floats_tensor, slow, torch_device -from diffusers.training_utils import EMAModel - - -torch.backends.cuda.matmul.allow_tf32 = False - - -class SampleObject(ConfigMixin): - config_name = "config.json" - - @register_to_config - def __init__( - self, - a=2, - b=5, - c=(2, 5), - d="for diffusion", - e=[1, 3], - ): - pass - - -class ConfigTester(unittest.TestCase): - def test_load_not_from_mixin(self): - with self.assertRaises(ValueError): - ConfigMixin.from_config("dummy_path") - - def test_register_to_config(self): - obj = SampleObject() - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # init ignore private arguments - obj = SampleObject(_name_or_path="lalala") - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can override default - obj = SampleObject(c=6) - config = obj.config - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - # can use positional arguments. 
- obj = SampleObject(1, c=6) - config = obj.config - assert config["a"] == 1 - assert config["b"] == 5 - assert config["c"] == 6 - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - def test_save_load(self): - obj = SampleObject() - config = obj.config - - assert config["a"] == 2 - assert config["b"] == 5 - assert config["c"] == (2, 5) - assert config["d"] == "for diffusion" - assert config["e"] == [1, 3] - - with tempfile.TemporaryDirectory() as tmpdirname: - obj.save_config(tmpdirname) - new_obj = SampleObject.from_config(tmpdirname) - new_config = new_obj.config - - # unfreeze configs - config = dict(config) - new_config = dict(new_config) - - assert config.pop("c") == (2, 5) # instantiated as tuple - assert new_config.pop("c") == [2, 5] # saved & loaded as list because of json - assert config == new_config - - -class ModelTesterMixin: - def test_from_pretrained_save_pretrained(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - new_model = self.model_class.from_pretrained(tmpdirname) - new_model.to(torch_device) - - with torch.no_grad(): - image = model(**inputs_dict) - if isinstance(image, dict): - image = image["sample"] - - new_image = new_model(**inputs_dict) - - if isinstance(new_image, dict): - new_image = new_image["sample"] - - max_diff = (image - new_image).abs().sum().item() - self.assertLessEqual(max_diff, 5e-5, "Models give different forward passes") - - def test_determinism(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - with torch.no_grad(): - first = model(**inputs_dict) - if isinstance(first, dict): - first = first["sample"] - - second = model(**inputs_dict) - if isinstance(second, dict): - second = second["sample"] - - out_1 = first.cpu().numpy() - out_2 = second.cpu().numpy() - out_1 = out_1[~np.isnan(out_1)] - out_2 = out_2[~np.isnan(out_2)] - max_diff = np.amax(np.abs(out_1 - out_2)) - self.assertLessEqual(max_diff, 1e-5) - - def test_output(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - with torch.no_grad(): - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output["sample"] - - self.assertIsNotNone(output) - expected_shape = inputs_dict["sample"].shape - self.assertEqual(output.shape, expected_shape, "Input and output shapes do not match") - - def test_forward_signature(self): - init_dict, _ = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - signature = inspect.signature(model.forward) - # signature.parameters is an OrderedDict => so arg_names order is deterministic - arg_names = [*signature.parameters.keys()] - - expected_arg_names = ["sample", "timestep"] - self.assertListEqual(arg_names[:2], expected_arg_names) - - def test_model_from_config(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.eval() - - # test if the model can be loaded from the config - # and has all the expected shape - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_config(tmpdirname) - new_model = self.model_class.from_config(tmpdirname) - 
new_model.to(torch_device) - new_model.eval() - - # check if all paramters shape are the same - for param_name in model.state_dict().keys(): - param_1 = model.state_dict()[param_name] - param_2 = new_model.state_dict()[param_name] - self.assertEqual(param_1.shape, param_2.shape) - - with torch.no_grad(): - output_1 = model(**inputs_dict) - - if isinstance(output_1, dict): - output_1 = output_1["sample"] - - output_2 = new_model(**inputs_dict) - - if isinstance(output_2, dict): - output_2 = output_2["sample"] - - self.assertEqual(output_1.shape, output_2.shape) - - def test_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output["sample"] - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - - def test_ema_training(self): - init_dict, inputs_dict = self.prepare_init_args_and_inputs_for_common() - - model = self.model_class(**init_dict) - model.to(torch_device) - model.train() - ema_model = EMAModel(model, device=torch_device) - - output = model(**inputs_dict) - - if isinstance(output, dict): - output = output["sample"] - - noise = torch.randn((inputs_dict["sample"].shape[0],) + self.output_shape).to(torch_device) - loss = torch.nn.functional.mse_loss(output, noise) - loss.backward() - ema_model.step(model) - - -class UnetModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_channels": (32, 64), - "down_blocks": ("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), - "up_blocks": ("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), - "num_head_channels": None, - "out_channels": 3, - "in_channels": 3, - "num_res_blocks": 2, - "image_size": 32, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - -# TODO(Patrick) - Re-add this test after having correctly added the final VE checkpoints -# def test_output_pretrained(self): -# model = UNet2DModel.from_pretrained("fusing/ddpm_dummy_update", subfolder="unet") -# model.eval() -# -# torch.manual_seed(0) -# if torch.cuda.is_available(): -# torch.cuda.manual_seed_all(0) -# -# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) -# time_step = torch.tensor([10]) -# -# with torch.no_grad(): -# output = model(noise, time_step)["sample"] -# -# output_slice = output[0, -1, -3:, -3:].flatten() -# fmt: off -# expected_output_slice = torch.tensor([0.2891, -0.1899, 0.2595, -0.6214, 0.0968, -0.2622, 0.4688, 0.1311, 0.0053]) -# fmt: on -# self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - - -class UNetLDMModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 4 - sizes = (32, 32) - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = 
torch.tensor([10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (4, 32, 32) - - @property - def output_shape(self): - return (4, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "image_size": 32, - "in_channels": 4, - "out_channels": 4, - "num_res_blocks": 2, - "block_channels": (32, 64), - "num_head_channels": 32, - "conv_resample": True, - "down_blocks": ("UNetResDownBlock2D", "UNetResDownBlock2D"), - "up_blocks": ("UNetResUpBlock2D", "UNetResUpBlock2D"), - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/unet-ldm-dummy-update", output_loading_info=True) - - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input)["sample"] - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/unet-ldm-dummy-update") - model.eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) - time_step = torch.tensor([10] * noise.shape[0]) - - with torch.no_grad(): - output = model(noise, time_step)["sample"] - - output_slice = output[0, -1, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-13.3258, -20.1100, -15.9873, -17.6617, -23.0596, -17.9419, -13.3675, -16.1889, -12.3800]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) - - -# TODO(Patrick) - Re-add this test after having cleaned up LDM -# def test_output_pretrained_spatial_transformer(self): -# model = UNetLDMModel.from_pretrained("fusing/unet-ldm-dummy-spatial") -# model.eval() -# -# torch.manual_seed(0) -# if torch.cuda.is_available(): -# torch.cuda.manual_seed_all(0) -# -# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) -# context = torch.ones((1, 16, 64), dtype=torch.float32) -# time_step = torch.tensor([10] * noise.shape[0]) -# -# with torch.no_grad(): -# output = model(noise, time_step, context=context) -# -# output_slice = output[0, -1, -3:, -3:].flatten() -# fmt: off -# expected_output_slice = torch.tensor([61.3445, 56.9005, 29.4339, 59.5497, 60.7375, 34.1719, 48.1951, 42.6569, 25.0890]) -# fmt: on -# -# self.assertTrue(torch.allclose(output_slice, expected_output_slice, atol=1e-3)) -# - - -class NCSNppModelTests(ModelTesterMixin, unittest.TestCase): - model_class = UNet2DModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - noise = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [10]).to(torch_device) - - return {"sample": noise, "timestep": time_step} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "block_channels": [32, 64, 64, 64], - "in_channels": 3, - "num_res_blocks": 1, - "out_channels": 3, - "time_embedding_type": "fourier", - "resnet_eps": 1e-6, - "mid_block_scale_factor": math.sqrt(2.0), - "resnet_num_groups": None, - "down_blocks": [ - "UNetResSkipDownBlock2D", - 
"UNetResAttnSkipDownBlock2D", - "UNetResSkipDownBlock2D", - "UNetResSkipDownBlock2D", - ], - "up_blocks": [ - "UNetResSkipUpBlock2D", - "UNetResSkipUpBlock2D", - "UNetResAttnSkipUpBlock2D", - "UNetResSkipUpBlock2D", - ], - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_from_pretrained_hub(self): - model, loading_info = UNet2DModel.from_pretrained( - "/home/patrick/google_checkpoints/ncsnpp-celebahq-256", output_loading_info=True - - ) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input(sizes=(256, 256))) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained_ve_mid(self): - model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-celebahq-256") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (256, 256) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step)["sample"] - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-4836.2231, -6487.1387, -3816.7969, -7964.9253, -10966.2842, -20043.6016, 8137.0571, 2340.3499, 544.6114]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - - def test_output_pretrained_ve_large(self): - model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-ffhq-ve-dummy-update") - model.to(torch_device) - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - noise = torch.ones((batch_size, num_channels) + sizes).to(torch_device) - time_step = torch.tensor(batch_size * [1e-4]).to(torch_device) - - with torch.no_grad(): - output = model(noise, time_step)["sample"] - - output_slice = output[0, -3:, -3:, -1].flatten().cpu() - # fmt: off - expected_output_slice = torch.tensor([-0.0325, -0.0900, -0.0869, -0.0332, -0.0725, -0.0270, -0.0101, 0.0227, 0.0256]) - # fmt: on - - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - - -class VQModelTests(ModelTesterMixin, unittest.TestCase): - model_class = VQModel - - @property - def dummy_input(self, sizes=(32, 32)): - batch_size = 4 - num_channels = 3 - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "ch": 64, - "out_ch": 3, - "num_res_blocks": 1, - "in_channels": 3, - "attn_resolutions": [], - "resolution": 32, - "z_channels": 3, - "n_embed": 256, - "embed_dim": 3, - "sane_index_shape": False, - "ch_mult": (1,), - "dropout": 0.0, - "double_z": False, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_from_pretrained_hub(self): - model, loading_info = VQModel.from_pretrained("/home/patrick/google_checkpoints/vqgan-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = 
model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = VQModel.from_pretrained("/home/patrick/google_checkpoints/vqgan-dummy") - model.eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - image = torch.randn(1, model.config.in_channels, model.config.resolution, model.config.resolution) - with torch.no_grad(): - output = model(image) - - output_slice = output[0, -1, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-1.1321, 0.1056, 0.3505, -0.6461, -0.2014, 0.0419, -0.5763, -0.8462, -0.4218]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - - -class AutoencoderKLTests(ModelTesterMixin, unittest.TestCase): - model_class = AutoencoderKL - - @property - def dummy_input(self): - batch_size = 4 - num_channels = 3 - sizes = (32, 32) - - image = floats_tensor((batch_size, num_channels) + sizes).to(torch_device) - - return {"sample": image} - - @property - def input_shape(self): - return (3, 32, 32) - - @property - def output_shape(self): - return (3, 32, 32) - - def prepare_init_args_and_inputs_for_common(self): - init_dict = { - "ch": 64, - "ch_mult": (1,), - "embed_dim": 4, - "in_channels": 3, - "attn_resolutions": [], - "num_res_blocks": 1, - "out_ch": 3, - "resolution": 32, - "z_channels": 4, - } - inputs_dict = self.dummy_input - return init_dict, inputs_dict - - def test_forward_signature(self): - pass - - def test_training(self): - pass - - def test_from_pretrained_hub(self): - model, loading_info = AutoencoderKL.from_pretrained("/home/patrick/google_checkpoints/autoencoder-kl-dummy", output_loading_info=True) - self.assertIsNotNone(model) - self.assertEqual(len(loading_info["missing_keys"]), 0) - - model.to(torch_device) - image = model(**self.dummy_input) - - assert image is not None, "Make sure output is not None" - - def test_output_pretrained(self): - model = AutoencoderKL.from_pretrained("/home/patrick/google_checkpoints/autoencoder-kl-dummy") - model.eval() - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - image = torch.randn(1, model.config.in_channels, model.config.resolution, model.config.resolution) - with torch.no_grad(): - output = model(image, sample_posterior=True) - - output_slice = output[0, -1, -3:, -3:].flatten() - # fmt: off - expected_output_slice = torch.tensor([-0.0814, -0.0229, -0.1320, -0.4123, -0.0366, -0.3473, 0.0438, -0.1662, 0.1750]) - # fmt: on - self.assertTrue(torch.allclose(output_slice, expected_output_slice, rtol=1e-2)) - - -class PipelineTesterMixin(unittest.TestCase): - def test_from_pretrained_save_pretrained(self): - # 1. 
Load models - model = UNet2DModel( - block_channels=(32, 64), - num_res_blocks=2, - image_size=32, - in_channels=3, - out_channels=3, - down_blocks=("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), - up_blocks=("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), - ) - schedular = DDPMScheduler(num_train_timesteps=10) - - ddpm = DDPMPipeline(model, schedular) - - with tempfile.TemporaryDirectory() as tmpdirname: - ddpm.save_pretrained(tmpdirname) - new_ddpm = DDPMPipeline.from_pretrained(tmpdirname) - - generator = torch.manual_seed(0) - - image = ddpm(generator=generator)["sample"] - generator = generator.manual_seed(0) - new_image = new_ddpm(generator=generator)["sample"] - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - @slow - def test_from_pretrained_hub(self): - model_path = "/home/patrick/google_checkpoints/ddpm-cifar10-32" - - ddpm = DDPMPipeline.from_pretrained(model_path) - ddpm_from_hub = DiffusionPipeline.from_pretrained(model_path) - - ddpm.scheduler.num_timesteps = 10 - ddpm_from_hub.scheduler.num_timesteps = 10 - - generator = torch.manual_seed(0) - - image = ddpm(generator=generator)["sample"] - generator = generator.manual_seed(0) - new_image = ddpm_from_hub(generator=generator)["sample"] - - assert np.abs(image - new_image).sum() < 1e-5, "Models don't give the same forward pass" - - @slow - def test_ddpm_cifar10(self): - model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDPMScheduler.from_config(model_id) - scheduler = scheduler.set_format("pt") - - ddpm = DDPMPipeline(unet=unet, scheduler=scheduler) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.41995, 0.35885, 0.19385, 0.38475, 0.3382, 0.2647, 0.41545, 0.3582, 0.33845]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_ddim_lsun(self): - model_id = "/home/patrick/google_checkpoints/ddpm-ema-bedroom-256" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler.from_config(model_id) - - ddpm = DDIMPipeline(unet=unet, scheduler=scheduler) - - generator = torch.manual_seed(0) - image = ddpm(generator=generator)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.00605, 0.0201, 0.0344, 0.00235, 0.00185, 0.00025, 0.00215, 0.0, 0.00685]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_ddim_cifar10(self): - model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = DDIMScheduler(tensor_format="pt") - - ddim = DDIMPipeline(unet=unet, scheduler=scheduler) - - generator = torch.manual_seed(0) - image = ddim(generator=generator, eta=0.0)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.17235, 0.16175, 0.16005, 0.16255, 0.1497, 0.1513, 0.15045, 0.1442, 0.1453]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_pndm_cifar10(self): - model_id = "/home/patrick/google_checkpoints/ddpm-cifar10-32" - - unet = UNet2DModel.from_pretrained(model_id) - scheduler = PNDMScheduler(tensor_format="pt") - - pndm = PNDMPipeline(unet=unet, scheduler=scheduler) - generator = torch.manual_seed(0) - image = pndm(generator=generator)["sample"] - - 
image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 32, 32, 3) - expected_slice = np.array([0.1564, 0.14645, 0.1406, 0.14715, 0.12425, 0.14045, 0.13115, 0.12175, 0.125]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_ldm_text2img(self): - ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = ldm([prompt], generator=generator, guidance_scale=6.0, num_inference_steps=20)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.9256, 0.9340, 0.8933, 0.9361, 0.9113, 0.8727, 0.9122, 0.8745, 0.8099]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_ldm_text2img_fast(self): - ldm = LatentDiffusionPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-text2im-large-256") - - prompt = "A painting of a squirrel eating a burger" - generator = torch.manual_seed(0) - image = ldm([prompt], generator=generator, num_inference_steps=1)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.3163, 0.8670, 0.6465, 0.1865, 0.6291, 0.5139, 0.2824, 0.3723, 0.4344]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_score_sde_ve_pipeline(self): - model = UNet2DModel.from_pretrained("/home/patrick/google_checkpoints/ncsnpp-church-256") - - torch.manual_seed(0) - if torch.cuda.is_available(): - torch.cuda.manual_seed_all(0) - - scheduler = ScoreSdeVeScheduler.from_config("/home/patrick/google_checkpoints/ncsnpp-church-256") - - sde_ve = ScoreSdeVePipeline(model=model, scheduler=scheduler) - - torch.manual_seed(0) - image = sde_ve(num_inference_steps=300)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.64363, 0.5868, 0.3031, 0.2284, 0.7409, 0.3216, 0.25643, 0.6557, 0.2633]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 - - @slow - def test_ldm_uncond(self): - ldm = LatentDiffusionUncondPipeline.from_pretrained("/home/patrick/google_checkpoints/ldm-celebahq-256") - - generator = torch.manual_seed(0) - image = ldm(generator=generator, num_inference_steps=5)["sample"] - - image_slice = image[0, -3:, -3:, -1] - - assert image.shape == (1, 256, 256, 3) - expected_slice = np.array([0.4399, 0.44975, 0.46825, 0.474, 0.4359, 0.4581, 0.45095, 0.4341, 0.4447]) - assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-2 diff --git a/_ b/_ new file mode 100644 index 000000000000..cf1443d4261f --- /dev/null +++ b/_ @@ -0,0 +1,82 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Conversion script for the LDM checkpoints. 
""" + +import argparse +import os +import json +import torch +from diffusers import UNet2DModel + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + + parser.add_argument( + "--repo_path", + default=None, + type=str, + required=True, + help="The config json file corresponding to the architecture.", + ) + + parser.add_argument( + "--dump_path", default=None, type=str, required=True, help="Path to the output model." + ) + + args = parser.parse_args() + + config_parameters_to_change = { + + } + + key_parameters_to_change = { + + } + + model = UNet2DModel.from_config(args.repo_path) + config = dict(model.config) + + for key, value in config_parameters_to_change.items(): + if key in config: + if isinstance(value, dict): + new_list = [] + + for block_name in config[key]: + # map old block name to new one + new_list.append(value[block_name]) + else: + config[key] = value + + state_dict = torch.load(os.path.join(args.repo_path, "diffusion_pytorch_model.bin")) + + new_state_dict = {} + for key, new_key in key_parameters_to_change.items(): + for param_key, param_value in state_dict.items(): + if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): + continue + else: + new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value + + model.load_state_dict(state_dict) + + try: + scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) + vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + + pipe = LatentDiffusionUncondPipeline(unet=model, scheduler=scheduler, vae=vqvae) + pipe.save_pretrained(args.dump_path) + except: + model.save_pretrained(args.dump_path) diff --git a/scripts/change_configs.py b/scripts/change_configs.py index 49b1cef67fc1..191f607f48bb 100644 --- a/scripts/change_configs.py +++ b/scripts/change_configs.py @@ -15,7 +15,7 @@ """ Conversion script for the LDM checkpoints. 
""" import argparse -import json +import os import torch from diffusers import UNet2DModel @@ -23,12 +23,8 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() - config_parameters_to_change = { - - } - parser.add_argument( - "--config_file", + "--repo_path", default=None, type=str, required=True, @@ -41,9 +37,26 @@ args = parser.parse_args() - with open(args.config_file) as f: - config = json.loads(f.read()) + config_parameters_to_change = { + "image_size": "sample_size", + "num_res_blocks": "layers_per_block", + "block_channels": "block_out_channels", + "downscale_freq_shift": "freq_shift", + "resnet_num_groups": "num_groups_norm", + "resnet_act_fn": "act_fn", + "resnet_eps": "norm_eps", + "num_head_channels": "attention_head_dim", + } + + key_parameters_to_change = { + "time_steps": "time_proj", + "mid": "mid_block", + "downsample_blocks": "down_blocks", + "upsample_blocks": "up_blocks", + } + model = UNet2DModel.from_config(args.repo_path) + config = dict(model.config) for key, value in config_parameters_to_change.items(): if key in config: @@ -56,16 +69,20 @@ else: config[key] = value + config["down_blocks"] = [k.replace("UNetRes", "") for k in config["down_blocks"]] + config["up_blocks"] = [k.replace("UNetRes", "") for k in config["up_blocks"]] - state_dict = torch.load( model = UNet2DModel(**config) - model.load_state_dict(converted_checkpoint) - try: - scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) - vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) + state_dict = torch.load(os.path.join(args.repo_path, "diffusion_pytorch_model.bin")) + + new_state_dict = {} + for key, new_key in key_parameters_to_change.items(): + for param_key, param_value in state_dict.items(): + if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): + continue + else: + new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value - pipe = LatentDiffusionUncondPipeline(unet=model, scheduler=scheduler, vae=vqvae) - pipe.save_pretrained(args.dump_path) - except: - model.save_pretrained(args.dump_path) + model.load_state_dict(state_dict) + model.save_pretrained(args.repo_path + "_new") diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index a59a1e7988a0..ed5406a84ada 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -94,8 +94,10 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool logger.info(f"ConfigMixinuration saved in {output_config_file}") @classmethod - def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs): - config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) + def from_config( + cls, pretrained_model__name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs + ): + config_dict = cls.get_config_dict(pretrained_model__name_or_path=pretrained_model__name_or_path, **kwargs) init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) @@ -108,7 +110,7 @@ def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], ret @classmethod def get_config_dict( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs + cls, pretrained_model__name_or_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = 
kwargs.pop("force_download", False) @@ -121,7 +123,7 @@ def get_config_dict( user_agent = {"file_type": "config"} - pretrained_model_name_or_path = str(pretrained_model_name_or_path) + pretrained_model__name_or_path = str(pretrained_model__name_or_path) if cls.config_name is None: raise ValueError( @@ -129,25 +131,25 @@ def get_config_dict( "`ConfigMixin`. Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" ) - if os.path.isfile(pretrained_model_name_or_path): - config_file = pretrained_model_name_or_path - elif os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): + if os.path.isfile(pretrained_model__name_or_path): + config_file = pretrained_model__name_or_path + elif os.path.isdir(pretrained_model__name_or_path): + if os.path.isfile(os.path.join(pretrained_model__name_or_path, cls.config_name)): # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) + config_file = os.path.join(pretrained_model__name_or_path, cls.config_name) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + os.path.join(pretrained_model__name_or_path, subfolder, cls.config_name) ): - config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) + config_file = os.path.join(pretrained_model__name_or_path, subfolder, cls.config_name) else: raise EnvironmentError( - f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." + f"Error no file named {cls.config_name} found in directory {pretrained_model__name_or_path}." ) else: try: # Load from URL or cache if already cached config_file = hf_hub_download( - pretrained_model_name_or_path, + pretrained_model__name_or_path, filename=cls.config_name, cache_dir=cache_dir, force_download=force_download, @@ -161,39 +163,39 @@ def get_config_dict( except RepositoryNotFoundError: raise EnvironmentError( - f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed" - " on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token" - " having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and" - " pass `use_auth_token=True`." + f"{pretrained_model__name_or_path} is not a local folder and is not a valid model identifier" + " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a" + " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli" + " login` and pass `use_auth_token=True`." ) except RevisionNotFoundError: raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for" " this model name. Check the model page at" - f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + f" 'https://huggingface.co/{pretrained_model__name_or_path}' for available revisions." ) except EntryNotFoundError: raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." + f"{pretrained_model__name_or_path} does not appear to have a file named {cls.config_name}." 
) except HTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" - f" {pretrained_model_name_or_path}:\n{err}" + f" {pretrained_model__name_or_path}:\n{err}" ) except ValueError: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" - f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" in the cached files and it looks like {pretrained_model__name_or_path} is not the path to a" f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to" " run the library in offline mode at" " 'https://huggingface.co/docs/diffusers/installation#offline-mode'." ) except EnvironmentError: raise EnvironmentError( - f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " + f"Can't load config for '{pretrained_model__name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"Otherwise, make sure '{pretrained_model__name_or_path}' is the correct path to a directory " f"containing a {cls.config_name} file" ) diff --git a/src/diffusers/dynamic_modules_utils.py b/src/diffusers/dynamic_modules_utils.py index 0ebf916e7af5..58e49e6209f3 100644 --- a/src/diffusers/dynamic_modules_utils.py +++ b/src/diffusers/dynamic_modules_utils.py @@ -149,7 +149,7 @@ def get_class_in_module(class_name, module_path): def get_cached_module_file( - pretrained_model_name_or_path: Union[str, os.PathLike], + pretrained_model__name_or_path: Union[str, os.PathLike], module_file: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, @@ -164,7 +164,7 @@ def get_cached_module_file( Transformers module. Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): + pretrained_model__name_or_path (`str` or `os.PathLike`): This can be either: - a string, the *model id* of a pretrained model configuration hosted inside a model repo on @@ -205,9 +205,9 @@ def get_cached_module_file( Returns: `str`: The path to the module inside the cache. """ - # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) + # Download and cache module_file from the repo `pretrained_model__name_or_path` of grab it if it's a local file. 
+ pretrained_model__name_or_path = str(pretrained_model__name_or_path) + module_file_or_url = os.path.join(pretrained_model__name_or_path, module_file) submodule = "local" if os.path.isfile(module_file_or_url): @@ -226,7 +226,7 @@ def get_cached_module_file( ) except EnvironmentError: - logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") + logger.error(f"Could not locate the {module_file} inside {pretrained_model__name_or_path}.") raise # Check we have all the requirements in our environment @@ -242,12 +242,12 @@ def get_cached_module_file( shutil.copy(resolved_module_file, submodule_path / module_file) for module_needed in modules_needed: module_needed = f"{module_needed}.py" - shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed) + shutil.copy(os.path.join(pretrained_model__name_or_path, module_needed), submodule_path / module_needed) return os.path.join(full_submodule, module_file) def get_class_from_dynamic_module( - pretrained_model_name_or_path: Union[str, os.PathLike], + pretrained_model__name_or_path: Union[str, os.PathLike], module_file: str, class_name: str, cache_dir: Optional[Union[str, os.PathLike]] = None, @@ -270,7 +270,7 @@ def get_class_from_dynamic_module( Args: - pretrained_model_name_or_path (`str` or `os.PathLike`): + pretrained_model__name_or_path (`str` or `os.PathLike`): This can be either: - a string, the *model id* of a pretrained model configuration hosted inside a model repo on @@ -322,7 +322,7 @@ def get_class_from_dynamic_module( ```""" # And lastly we get the class inside our newly created module final_module = get_cached_module_file( - pretrained_model_name_or_path, + pretrained_model__name_or_path, module_file, cache_dir=cache_dir, force_download=force_download, diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 1380f16d1df0..f30cbda6ae8e 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -147,7 +147,7 @@ class ModelMixin(torch.nn.Module): models, `pixel_values` for vision models and `input_values` for speech models). """ config_name = CONFIG_NAME - _automatically_saved_args = ["_diffusers_version", "_class_name", "name_or_path"] + _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"] def __init__(self): super().__init__() @@ -207,7 +207,7 @@ def save_pretrained( logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}") @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. @@ -222,7 +222,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P weights are discarded. Parameters: - pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): + pretrained_model__name_or_path (`str` or `os.PathLike`, *optional*): Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. @@ -244,17 +244,17 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P model). - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the save directory. 
- - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a + - The model is loaded by supplying a local directory as `pretrained_model__name_or_path` and a configuration JSON file named *config.json* is found in the directory. cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. from_tf (`bool`, *optional*, defaults to `False`): Load the model weights from a TensorFlow checkpoint save file (see docstring of - `pretrained_model_name_or_path` argument). + `pretrained_model__name_or_path` argument). from_flax (`bool`, *optional*, defaults to `False`): Load the model weights from a Flax checkpoint save file (see docstring of - `pretrained_model_name_or_path` argument). + `pretrained_model__name_or_path` argument). ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether or not to raise an error if some of the weights from the checkpoint do not have the same size as the weights of the model (if for instance, you are instantiating a model with 10 labels from a @@ -327,7 +327,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} # Load config if we don't provide a configuration - config_path = pretrained_model_name_or_path + config_path = pretrained_model__name_or_path model, unused_kwargs = cls.from_config( config_path, cache_dir=cache_dir, @@ -341,27 +341,27 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P subfolder=subfolder, **kwargs, ) - model.register_to_config(name_or_path=pretrained_model_name_or_path) + model.register_to_config(_name_or_path=pretrained_model__name_or_path) # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # Load model - pretrained_model_name_or_path = str(pretrained_model_name_or_path) - if os.path.isdir(pretrained_model_name_or_path): - if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): + pretrained_model__name_or_path = str(pretrained_model__name_or_path) + if os.path.isdir(pretrained_model__name_or_path): + if os.path.isfile(os.path.join(pretrained_model__name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint - model_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) + model_file = os.path.join(pretrained_model__name_or_path, WEIGHTS_NAME) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME) + os.path.join(pretrained_model__name_or_path, subfolder, WEIGHTS_NAME) ): - model_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME) + model_file = os.path.join(pretrained_model__name_or_path, subfolder, WEIGHTS_NAME) else: raise EnvironmentError( - f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model_name_or_path}." + f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model__name_or_path}." 
) else: try: # Load from URL or cache if already cached model_file = hf_hub_download( - pretrained_model_name_or_path, + pretrained_model__name_or_path, filename=WEIGHTS_NAME, cache_dir=cache_dir, force_download=force_download, @@ -375,7 +375,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P except RepositoryNotFoundError: raise EnvironmentError( - f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " + f"{pretrained_model__name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " "login` and pass `use_auth_token=True`." @@ -384,30 +384,30 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " "this model name. Check the model page at " - f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." + f"'https://huggingface.co/{pretrained_model__name_or_path}' for available revisions." ) except EntryNotFoundError: raise EnvironmentError( - f"{pretrained_model_name_or_path} does not appear to have a file named {model_file}." + f"{pretrained_model__name_or_path} does not appear to have a file named {model_file}." ) except HTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" - f" {pretrained_model_name_or_path}:\n{err}" + f" {pretrained_model__name_or_path}:\n{err}" ) except ValueError: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" - f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" + f" in the cached files and it looks like {pretrained_model__name_or_path} is not the path to a" f" directory containing a file named {WEIGHTS_NAME} or" " \nCheckout your internet connection or see how to run the library in" " offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'." ) except EnvironmentError: raise EnvironmentError( - f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " + f"Can't load the model for '{pretrained_model__name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. 
" - f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " + f"Otherwise, make sure '{pretrained_model__name_or_path}' is the correct path to a directory " f"containing a file named {WEIGHTS_NAME}" ) @@ -417,7 +417,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P model, state_dict, model_file, - pretrained_model_name_or_path, + pretrained_model__name_or_path, ignore_mismatched_sizes=ignore_mismatched_sizes, ) @@ -441,7 +441,7 @@ def _load_pretrained_model( model, state_dict, resolved_archive_file, - pretrained_model_name_or_path, + pretrained_model__name_or_path, ignore_mismatched_sizes=False, ): # Retrieve missing & unexpected_keys @@ -500,7 +500,7 @@ def _find_mismatched_keys( if False: if len(unexpected_keys) > 0: logger.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f"Some weights of the model checkpoint at {pretrained_model__name_or_path} were not used when" f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" @@ -514,13 +514,13 @@ def _find_mismatched_keys( if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + f" {pretrained_model__name_or_path} and are newly initialized: {missing_keys}\nYou should probably" " TRAIN this model on a down-stream task to be able to use it for predictions and inference." ) elif len(mismatched_keys) == 0: logger.info( f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" + f" {pretrained_model__name_or_path}.\nIf your task is similar to the task the model of the" f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" " without further training." ) @@ -533,7 +533,7 @@ def _find_mismatched_keys( ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" {pretrained_model__name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" " able to use it for predictions and inference." 
) diff --git a/src/diffusers/models/attention.py b/src/diffusers/models/attention.py index 795eca7f63f1..dd22cdbb9512 100644 --- a/src/diffusers/models/attention.py +++ b/src/diffusers/models/attention.py @@ -17,7 +17,6 @@ class AttentionBlockNew(nn.Module): def __init__( self, channels, - num_heads=1, num_head_channels=None, num_groups=32, rescale_output_factor=1.0, @@ -25,14 +24,8 @@ def __init__( ): super().__init__() self.channels = channels - if num_head_channels is None: - self.num_heads = num_heads - else: - assert ( - channels % num_head_channels == 0 - ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" - self.num_heads = channels // num_head_channels + self.num_heads = channels // num_head_channels if num_head_channels is not None else 1 self.num_head_size = num_head_channels self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 72de88e046df..06e55a1e1459 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -40,7 +40,6 @@ def __init__( in_channels=None, out_channels=None, num_res_blocks=None, - dropout=0, block_channels=(224, 448, 672, 896), down_blocks=( "UNetResDownBlock2D", @@ -52,7 +51,6 @@ def __init__( up_blocks=("UNetResAttnUpBlock2D", "UNetResAttnUpBlock2D", "UNetResAttnUpBlock2D", "UNetResUpBlock2D"), resnet_act_fn="silu", resnet_eps=1e-5, - conv_resample=True, num_head_channels=32, flip_sin_to_cos=True, downscale_freq_shift=0, @@ -106,7 +104,6 @@ def __init__( # mid self.mid = UNetMidBlock2D( in_channels=block_channels[-1], - dropout=dropout, temb_channels=time_embed_dim, resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index ee593b463210..1fd4ff656718 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -112,7 +112,7 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]): save_method(os.path.join(save_directory, pipeline_component_name)) @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Add docstrings """ @@ -125,9 +125,9 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P # 1. 
Download the checkpoints and configs # use snapshot download here to get it working from from_pretrained - if not os.path.isdir(pretrained_model_name_or_path): + if not os.path.isdir(pretrained_model__name_or_path): cached_folder = snapshot_download( - pretrained_model_name_or_path, + pretrained_model__name_or_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, @@ -136,7 +136,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P revision=revision, ) else: - cached_folder = pretrained_model_name_or_path + cached_folder = pretrained_model__name_or_path config_dict = cls.get_config_dict(cached_folder) diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py index 5f9227c9cb73..9b2009796ff7 100644 --- a/src/diffusers/pipelines/ddim/pipeline_ddim.py +++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -28,7 +28,9 @@ def __init__(self, unet, scheduler): self.register_modules(unet=unet, scheduler=scheduler) @torch.no_grad() - def __call__(self, batch_size=1, generator=None, torch_device=None, eta=0.0, num_inference_steps=50, output_type="pil"): + def __call__( + self, batch_size=1, generator=None, torch_device=None, eta=0.0, num_inference_steps=50, output_type="pil" + ): # eta corresponds to η in paper and should be between [0, 1] if torch_device is None: torch_device = "cuda" if torch.cuda.is_available() else "cpu" diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 7d9378e0171e..6a439a92aaa0 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -78,7 +78,7 @@ def test_register_to_config(self): assert config["e"] == [1, 3] # init ignore private arguments - obj = SampleObject(_name_or_path="lalala") + obj = SampleObject(__name_or_path="lalala") config = obj.config assert config["a"] == 2 assert config["b"] == 5 @@ -736,7 +736,7 @@ def test_from_pretrained_hub(self): @slow def test_output_format(self): - model_path = "google/ddpm-cifar10-32" + model_path = "/home/patrick/google_checkpoints/ddpm-cifar10-32" pipe = DDIMPipeline.from_pretrained(model_path) From 4ce0a6c10625be4fe2155ee7c2c2ff47f939b7ea Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 17:28:31 +0000 Subject: [PATCH 07/12] up --- _ | 82 --------------------------------------------------------------- 1 file changed, 82 deletions(-) delete mode 100644 _ diff --git a/_ b/_ deleted file mode 100644 index cf1443d4261f..000000000000 --- a/_ +++ /dev/null @@ -1,82 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the LDM checkpoints. 
""" - -import argparse -import os -import json -import torch -from diffusers import UNet2DModel - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--repo_path", - default=None, - type=str, - required=True, - help="The config json file corresponding to the architecture.", - ) - - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." - ) - - args = parser.parse_args() - - config_parameters_to_change = { - - } - - key_parameters_to_change = { - - } - - model = UNet2DModel.from_config(args.repo_path) - config = dict(model.config) - - for key, value in config_parameters_to_change.items(): - if key in config: - if isinstance(value, dict): - new_list = [] - - for block_name in config[key]: - # map old block name to new one - new_list.append(value[block_name]) - else: - config[key] = value - - state_dict = torch.load(os.path.join(args.repo_path, "diffusion_pytorch_model.bin")) - - new_state_dict = {} - for key, new_key in key_parameters_to_change.items(): - for param_key, param_value in state_dict.items(): - if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): - continue - else: - new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value - - model.load_state_dict(state_dict) - - try: - scheduler = DDPMScheduler.from_config("/".join(args.checkpoint_path.split("/")[:-1])) - vqvae = VQModel.from_pretrained("/".join(args.checkpoint_path.split("/")[:-1])) - - pipe = LatentDiffusionUncondPipeline(unet=model, scheduler=scheduler, vae=vqvae) - pipe.save_pretrained(args.dump_path) - except: - model.save_pretrained(args.dump_path) From 1d44711ef909e4e95f026e7115af97b81d2ef240 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 21:01:17 +0000 Subject: [PATCH 08/12] save checkpoint --- scripts/change_configs.py | 36 +++--- scripts/generate_logits.py | 5 +- src/diffusers/models/unet_2d.py | 111 +++++++++--------- src/diffusers/models/unet_2d_condition.py | 107 ++++++++--------- src/diffusers/models/unet_blocks.py | 62 +++++----- src/diffusers/pipelines/ddim/pipeline_ddim.py | 2 +- src/diffusers/pipelines/ddpm/pipeline_ddpm.py | 2 +- .../pipeline_latent_diffusion.py | 2 +- .../pipeline_latent_diffusion_uncond.py | 2 +- src/diffusers/pipelines/pndm/pipeline_pndm.py | 2 +- .../score_sde_ve/pipeline_score_sde_ve.py | 2 +- tests/test_modeling_utils.py | 13 +- 12 files changed, 172 insertions(+), 174 deletions(-) diff --git a/scripts/change_configs.py b/scripts/change_configs.py index 191f607f48bb..c8d714dbe9a8 100644 --- a/scripts/change_configs.py +++ b/scripts/change_configs.py @@ -17,7 +17,8 @@ import argparse import os import torch -from diffusers import UNet2DModel +from diffusers import UNet2DModel, UNet2DConditionModel +from transformers.file_utils import has_file if __name__ == "__main__": @@ -42,7 +43,7 @@ "num_res_blocks": "layers_per_block", "block_channels": "block_out_channels", "downscale_freq_shift": "freq_shift", - "resnet_num_groups": "num_groups_norm", + "resnet_num_groups": "norm_num_groups", "resnet_act_fn": "act_fn", "resnet_eps": "norm_eps", "num_head_channels": "attention_head_dim", @@ -55,26 +56,30 @@ "upsample_blocks": "up_blocks", } - model = UNet2DModel.from_config(args.repo_path) + if has_file(args.repo_path, "config.json"): + model = UNet2DModel.from_config(args.repo_path) + subfolder = "" + else: + subfolder = "unet" + class_name = UNet2DConditionModel if "ldm-text2im-large-256" in args.repo_path else 
UNet2DModel + model = class_name.from_config(args.repo_path, subfolder=subfolder) + config = dict(model.config) for key, value in config_parameters_to_change.items(): if key in config: - if isinstance(value, dict): - new_list = [] - - for block_name in config[key]: - # map old block name to new one - new_list.append(value[block_name]) - else: - config[key] = value + config[value] = config[key] + del config[key] config["down_blocks"] = [k.replace("UNetRes", "") for k in config["down_blocks"]] config["up_blocks"] = [k.replace("UNetRes", "") for k in config["up_blocks"]] - model = UNet2DModel(**config) + if has_file(args.repo_path, "config.json"): + model = UNet2DModel(**config) + else: + model = UNet2DConditionModel(**config) - state_dict = torch.load(os.path.join(args.repo_path, "diffusion_pytorch_model.bin")) + state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin")) new_state_dict = {} for key, new_key in key_parameters_to_change.items(): @@ -85,4 +90,7 @@ new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value model.load_state_dict(state_dict) - model.save_pretrained(args.repo_path + "_new") + if has_file(args.repo_path, "config.json"): + model.save_pretrained(args.repo_path) + else: + model.save_pretrained(os.path.join(args.repo_path, "unet")) diff --git a/scripts/generate_logits.py b/scripts/generate_logits.py index 93a94b7704c0..d586c4994d8e 100644 --- a/scripts/generate_logits.py +++ b/scripts/generate_logits.py @@ -70,9 +70,10 @@ models = api.list_models(filter="diffusers") for mod in models: if "google" in mod.author or mod.modelId == "CompVis/ldm-celebahq-256": - local_checkpoint = "/home/patrick/google_checkpoints/" + mod.modelId.split("/")[-1] + print(f"Started running {mod.modelId}!!!") + if mod.modelId.startswith("CompVis"): model = UNet2DModel.from_pretrained(local_checkpoint, subfolder = "unet") else: @@ -81,7 +82,7 @@ torch.manual_seed(0) random.seed(0) - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) + noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) time_step = torch.tensor([10] * noise.shape[0]) with torch.no_grad(): logits = model(noise, time_step)['sample'] diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 06e55a1e1459..72f940dd0b8f 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -10,28 +10,6 @@ class UNet2DModel(ModelMixin, ConfigMixin): - """ - The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param - model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param - num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample - rates at which - attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x - downsampling, attention will be used. - :param dropout: the dropout probability. :param channel_mult: channel multiplier for each level of the UNet. :param - conv_resample: if True, use learned convolutions for upsampling and - downsampling. - :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this - model will be - class-conditional with `num_classes` classes. - :param use_checkpoint: use gradient checkpointing to reduce memory usage. 
:param num_heads: the number of attention - heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use - a fixed channel width per attention head. - :param num_heads_upsample: works with num_heads to set a different number - of heads for upsampling. Deprecated. - :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks - for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially - increased efficiency. - """ @register_to_config def __init__( @@ -42,13 +20,13 @@ def __init__( num_res_blocks=None, block_channels=(224, 448, 672, 896), down_blocks=( - "UNetResDownBlock2D", - "UNetResAttnDownBlock2D", - "UNetResAttnDownBlock2D", - "UNetResAttnDownBlock2D", + "DownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", + "AttnDownBlock2D", ), downsample_padding=1, - up_blocks=("UNetResAttnUpBlock2D", "UNetResAttnUpBlock2D", "UNetResAttnUpBlock2D", "UNetResUpBlock2D"), + up_blocks=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), resnet_act_fn="silu", resnet_eps=1e-5, num_head_channels=32, @@ -58,21 +36,38 @@ def __init__( mid_block_scale_factor=1, center_input_sample=False, resnet_num_groups=32, + sample_size = None, + layers_per_block = None, + block_out_channels = None, + freq_shift = None, + norm_num_groups = None, + act_fn = None, + norm_eps = None, + attention_head_dim = None, ): super().__init__() - self.image_size = image_size - time_embed_dim = block_channels[0] * 4 + sample_size = sample_size or image_size + layers_per_block = layers_per_block or num_res_blocks + block_out_channels = block_out_channels or block_channels + freq_shift = freq_shift or downscale_freq_shift + norm_num_groups = norm_num_groups or resnet_num_groups + act_fn = act_fn or resnet_act_fn + norm_eps = norm_eps or resnet_eps + attention_head_dim = attention_head_dim or num_head_channels + + self.sample_size = sample_size + time_embed_dim = block_out_channels[0] * 4 # input - self.conv_in = nn.Conv2d(in_channels, block_channels[0], kernel_size=3, padding=(1, 1)) + self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) # time if time_embedding_type == "fourier": - self.time_steps = GaussianFourierProjection(embedding_size=block_channels[0], scale=16) - timestep_input_dim = 2 * block_channels[0] + self.time_steps = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) + timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": - self.time_steps = Timesteps(block_channels[0], flip_sin_to_cos, downscale_freq_shift) - timestep_input_dim = block_channels[0] + self.time_steps = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) @@ -81,68 +76,68 @@ def __init__( self.upsample_blocks = nn.ModuleList([]) # down - output_channel = block_channels[0] + output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_blocks): input_channel = output_channel - output_channel = block_channels[i] - is_final_block = i == len(block_channels) - 1 + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 down_block = get_down_block( down_block_type, - num_layers=num_res_blocks, + num_layers=layers_per_block, in_channels=input_channel, out_channels=output_channel, temb_channels=time_embed_dim, add_downsample=not 
is_final_block, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - attn_num_head_channels=num_head_channels, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, ) self.downsample_blocks.append(down_block) # mid self.mid = UNetMidBlock2D( - in_channels=block_channels[-1], + in_channels=block_out_channels[-1], temb_channels=time_embed_dim, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift="default", - attn_num_head_channels=num_head_channels, - resnet_groups=resnet_num_groups, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, ) # up - reversed_block_channels = list(reversed(block_channels)) - output_channel = reversed_block_channels[0] + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] for i, up_block_type in enumerate(up_blocks): prev_output_channel = output_channel - output_channel = reversed_block_channels[i] - input_channel = reversed_block_channels[min(i + 1, len(block_channels) - 1)] + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - is_final_block = i == len(block_channels) - 1 + is_final_block = i == len(block_out_channels) - 1 up_block = get_up_block( up_block_type, - num_layers=num_res_blocks + 1, + num_layers=layers_per_block + 1, in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, temb_channels=time_embed_dim, add_upsample=not is_final_block, - resnet_eps=resnet_eps, - resnet_act_fn=resnet_act_fn, - attn_num_head_channels=num_head_channels, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + attn_num_head_channels=attention_head_dim, ) self.upsample_blocks.append(up_block) prev_output_channel = output_channel # out - num_groups_out = resnet_num_groups if resnet_num_groups is not None else min(block_channels[0] // 4, 32) - self.conv_norm_out = nn.GroupNorm(num_channels=block_channels[0], num_groups=num_groups_out, eps=resnet_eps) + num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps) self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_channels[0], out_channels, 3, padding=1) + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) def forward( self, sample: torch.FloatTensor, timestep: Union[torch.Tensor, float, int] diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index f8bd3a120c20..49f70b4e933c 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -10,28 +10,6 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin): - """ - The full UNet model with attention and timestep embedding. :param in_channels: channels in the input Tensor. :param - model_channels: base channel count for the model. :param out_channels: channels in the output Tensor. :param - num_res_blocks: number of residual blocks per downsample. :param attention_resolutions: a collection of downsample - rates at which - attention will take place. May be a set, list, or tuple. For example, if this contains 4, then at 4x - downsampling, attention will be used. - :param dropout: the dropout probability. 
:param channel_mult: channel multiplier for each level of the UNet. :param - conv_resample: if True, use learned convolutions for upsampling and - downsampling. - :param dims: determines if the signal is 1D, 2D, or 3D. :param num_classes: if specified (as an int), then this - model will be - class-conditional with `num_classes` classes. - :param use_checkpoint: use gradient checkpointing to reduce memory usage. :param num_heads: the number of attention - heads in each attention layer. :param num_heads_channels: if specified, ignore num_heads and instead use - a fixed channel width per attention head. - :param num_heads_upsample: works with num_heads to set a different number - of heads for upsampling. Deprecated. - :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. :param resblock_updown: use residual blocks - for up/downsampling. :param use_new_attention_order: use a different attention pattern for potentially - increased efficiency. - """ @register_to_config def __init__( @@ -40,20 +18,19 @@ def __init__( in_channels=4, out_channels=4, num_res_blocks=2, - dropout=0, block_channels=(320, 640, 1280, 1280), down_blocks=( - "UNetResCrossAttnDownBlock2D", - "UNetResCrossAttnDownBlock2D", - "UNetResCrossAttnDownBlock2D", - "UNetResDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", ), downsample_padding=1, up_blocks=( - "UNetResUpBlock2D", - "UNetResCrossAttnUpBlock2D", - "UNetResCrossAttnUpBlock2D", - "UNetResCrossAttnUpBlock2D", + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", ), resnet_act_fn="silu", resnet_eps=1e-5, @@ -64,17 +41,34 @@ def __init__( mid_block_scale_factor=1, center_input_sample=False, resnet_num_groups=30, + sample_size = None, + layers_per_block = None, + block_out_channels = None, + freq_shift = None, + norm_num_groups = None, + act_fn = None, + norm_eps = None, + attention_head_dim = None, ): super().__init__() - self.image_size = image_size - time_embed_dim = block_channels[0] * 4 + sample_size = sample_size or image_size + layers_per_block = layers_per_block or num_res_blocks + block_out_channels = block_out_channels or block_channels + freq_shift = freq_shift or downscale_freq_shift + norm_num_groups = norm_num_groups or resnet_num_groups + act_fn = act_fn or resnet_act_fn + norm_eps = norm_eps or resnet_eps + attention_head_dim = attention_head_dim or num_head_channels + + self.mage_size = sample_size + time_embed_dim = block_out_channels[0] * 4 # input - self.conv_in = nn.Conv2d(in_channels, block_channels[0], kernel_size=3, padding=(1, 1)) + self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) # time - self.time_steps = Timesteps(block_channels[0], flip_sin_to_cos, downscale_freq_shift) - timestep_input_dim = block_channels[0] + self.time_steps = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) @@ -83,68 +77,67 @@ def __init__( self.upsample_blocks = nn.ModuleList([]) # down - output_channel = block_channels[0] + output_channel = block_out_channels[0] for i, down_block_type in enumerate(down_blocks): input_channel = output_channel - output_channel = block_channels[i] - is_final_block = i == len(block_channels) - 1 + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 down_block = get_down_block( down_block_type, - num_layers=num_res_blocks, + 
num_layers=layers_per_block, in_channels=input_channel, out_channels=output_channel, temb_channels=time_embed_dim, add_downsample=not is_final_block, - resnet_eps=resnet_eps, + resnet_eps=norm_eps, resnet_act_fn=resnet_act_fn, - attn_num_head_channels=num_head_channels, + attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, ) self.downsample_blocks.append(down_block) # mid self.mid = UNetMidBlock2DCrossAttn( - in_channels=block_channels[-1], - dropout=dropout, + in_channels=block_out_channels[-1], temb_channels=time_embed_dim, - resnet_eps=resnet_eps, + resnet_eps=norm_eps, resnet_act_fn=resnet_act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift="default", - attn_num_head_channels=num_head_channels, - resnet_groups=resnet_num_groups, + attn_num_head_channels=attention_head_dim, + resnet_groups=norm_num_groups, ) # up - reversed_block_channels = list(reversed(block_channels)) - output_channel = reversed_block_channels[0] + reversed_block_out_channels = list(reversed(block_out_channels)) + output_channel = reversed_block_out_channels[0] for i, up_block_type in enumerate(up_blocks): prev_output_channel = output_channel - output_channel = reversed_block_channels[i] - input_channel = reversed_block_channels[min(i + 1, len(block_channels) - 1)] + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] - is_final_block = i == len(block_channels) - 1 + is_final_block = i == len(block_out_channels) - 1 up_block = get_up_block( up_block_type, - num_layers=num_res_blocks + 1, + num_layers=layers_per_block + 1, in_channels=input_channel, out_channels=output_channel, prev_output_channel=prev_output_channel, temb_channels=time_embed_dim, add_upsample=not is_final_block, - resnet_eps=resnet_eps, + resnet_eps=norm_eps, resnet_act_fn=resnet_act_fn, - attn_num_head_channels=num_head_channels, + attn_num_head_channels=attention_head_dim, ) self.upsample_blocks.append(up_block) prev_output_channel = output_channel # out - self.conv_norm_out = nn.GroupNorm(num_channels=block_channels[0], num_groups=resnet_num_groups, eps=resnet_eps) + self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps) self.conv_act = nn.SiLU() - self.conv_out = nn.Conv2d(block_channels[0], out_channels, 3, padding=1) + self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1) def forward( self, diff --git a/src/diffusers/models/unet_blocks.py b/src/diffusers/models/unet_blocks.py index 60ec2f2e06e7..67082d2409bd 100644 --- a/src/diffusers/models/unet_blocks.py +++ b/src/diffusers/models/unet_blocks.py @@ -33,8 +33,9 @@ def get_down_block( attn_num_head_channels, downsample_padding=None, ): - if down_block_type == "UNetResDownBlock2D": - return UNetResDownBlock2D( + down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type + if down_block_type == "DownBlock2D": + return DownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -44,8 +45,8 @@ def get_down_block( resnet_act_fn=resnet_act_fn, downsample_padding=downsample_padding, ) - elif down_block_type == "UNetResAttnDownBlock2D": - return UNetResAttnDownBlock2D( + elif down_block_type == "AttnDownBlock2D": + return AttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -56,8 +57,8 @@ def get_down_block( downsample_padding=downsample_padding, 
attn_num_head_channels=attn_num_head_channels, ) - elif down_block_type == "UNetResCrossAttnDownBlock2D": - return UNetResCrossAttnDownBlock2D( + elif down_block_type == "CrossAttnDownBlock2D": + return CrossAttnDownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -68,8 +69,8 @@ def get_down_block( downsample_padding=downsample_padding, attn_num_head_channels=attn_num_head_channels, ) - elif down_block_type == "UNetResSkipDownBlock2D": - return UNetResSkipDownBlock2D( + elif down_block_type == "SkipDownBlock2D": + return SkipDownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -79,8 +80,8 @@ def get_down_block( resnet_act_fn=resnet_act_fn, downsample_padding=downsample_padding, ) - elif down_block_type == "UNetResAttnSkipDownBlock2D": - return UNetResAttnSkipDownBlock2D( + elif down_block_type == "AttnSkipDownBlock2D": + return AttnSkipDownBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -105,8 +106,9 @@ def get_up_block( resnet_act_fn, attn_num_head_channels, ): - if up_block_type == "UNetResUpBlock2D": - return UNetResUpBlock2D( + up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type + if up_block_type == "UpBlock2D": + return UpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -116,8 +118,8 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, ) - elif up_block_type == "UNetResCrossAttnUpBlock2D": - return UNetResCrossAttnUpBlock2D( + elif up_block_type == "CrossAttnUpBlock2D": + return CrossAttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -128,8 +130,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, attn_num_head_channels=attn_num_head_channels, ) - elif up_block_type == "UNetResAttnUpBlock2D": - return UNetResAttnUpBlock2D( + elif up_block_type == "AttnUpBlock2D": + return AttnUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -140,8 +142,8 @@ def get_up_block( resnet_act_fn=resnet_act_fn, attn_num_head_channels=attn_num_head_channels, ) - elif up_block_type == "UNetResSkipUpBlock2D": - return UNetResSkipUpBlock2D( + elif up_block_type == "SkipUpBlock2D": + return SkipUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -151,8 +153,8 @@ def get_up_block( resnet_eps=resnet_eps, resnet_act_fn=resnet_act_fn, ) - elif up_block_type == "UNetResAttnSkipUpBlock2D": - return UNetResAttnSkipUpBlock2D( + elif up_block_type == "AttnSkipUpBlock2D": + return AttnSkipUpBlock2D( num_layers=num_layers, in_channels=in_channels, out_channels=out_channels, @@ -322,7 +324,7 @@ def forward(self, hidden_states, temb=None, encoder_hidden_states=None): return hidden_states -class UNetResAttnDownBlock2D(nn.Module): +class AttnDownBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -403,7 +405,7 @@ def forward(self, hidden_states, temb=None): return hidden_states, output_states -class UNetResCrossAttnDownBlock2D(nn.Module): +class CrossAttnDownBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -485,7 +487,7 @@ def forward(self, hidden_states, temb=None, encoder_hidden_states=None): return hidden_states, output_states -class UNetResDownBlock2D(nn.Module): +class DownBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -551,7 +553,7 @@ def forward(self, hidden_states, temb=None): return hidden_states, output_states -class UNetResAttnSkipDownBlock2D(nn.Module): 
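
Note: the block factories above keep accepting the legacy `UNetRes*` identifiers by stripping the prefix before dispatching to the renamed classes. A minimal standalone sketch of that normalization follows; the helper name is illustrative and not part of diffusers.

```python
# Standalone sketch of the prefix-stripping done in get_down_block / get_up_block;
# normalize_block_type is a hypothetical name used only for illustration.
def normalize_block_type(block_type: str) -> str:
    # Legacy names such as "UNetResAttnDownBlock2D" map to "AttnDownBlock2D";
    # already-renamed identifiers pass through unchanged.
    prefix = "UNetRes"
    return block_type[len(prefix):] if block_type.startswith(prefix) else block_type


assert normalize_block_type("UNetResAttnDownBlock2D") == "AttnDownBlock2D"
assert normalize_block_type("DownBlock2D") == "DownBlock2D"
```
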
+class AttnSkipDownBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -644,7 +646,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): return hidden_states, output_states, skip_sample -class UNetResSkipDownBlock2D(nn.Module): +class SkipDownBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -723,7 +725,7 @@ def forward(self, hidden_states, temb=None, skip_sample=None): return hidden_states, output_states, skip_sample -class UNetResAttnUpBlock2D(nn.Module): +class AttnUpBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -801,7 +803,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): return hidden_states -class UNetResCrossAttnUpBlock2D(nn.Module): +class CrossAttnUpBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -881,7 +883,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, encoder_hid return hidden_states -class UNetResUpBlock2D(nn.Module): +class UpBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -944,7 +946,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None): return hidden_states -class UNetResAttnSkipUpBlock2D(nn.Module): +class AttnSkipUpBlock2D(nn.Module): def __init__( self, in_channels: int, @@ -1055,7 +1057,7 @@ def forward(self, hidden_states, res_hidden_states_tuple, temb=None, skip_sample return hidden_states, skip_sample -class UNetResSkipUpBlock2D(nn.Module): +class SkipUpBlock2D(nn.Module): def __init__( self, in_channels: int, diff --git a/src/diffusers/pipelines/ddim/pipeline_ddim.py b/src/diffusers/pipelines/ddim/pipeline_ddim.py index 9b2009796ff7..a1000ae2ef25 100644 --- a/src/diffusers/pipelines/ddim/pipeline_ddim.py +++ b/src/diffusers/pipelines/ddim/pipeline_ddim.py @@ -39,7 +39,7 @@ def __call__( # Sample gaussian noise to begin loop image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size), + (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), generator=generator, ) image = image.to(torch_device) diff --git a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py index a7309224ef25..c947827f014d 100644 --- a/src/diffusers/pipelines/ddpm/pipeline_ddpm.py +++ b/src/diffusers/pipelines/ddpm/pipeline_ddpm.py @@ -36,7 +36,7 @@ def __call__(self, batch_size=1, generator=None, torch_device=None, output_type= # Sample gaussian noise to begin loop image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size), + (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), generator=generator, ) image = image.to(torch_device) diff --git a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py index 5b3c5dc8cbb1..e6b209026422 100644 --- a/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py +++ b/src/diffusers/pipelines/latent_diffusion/pipeline_latent_diffusion.py @@ -52,7 +52,7 @@ def __call__( text_embeddings = self.bert(text_input.input_ids.to(torch_device)) latents = torch.randn( - (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size), + (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), generator=generator, ) latents = latents.to(torch_device) diff --git a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py 
b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py index 0964225e8ba9..5445c44cd73c 100644 --- a/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py +++ b/src/diffusers/pipelines/latent_diffusion_uncond/pipeline_latent_diffusion_uncond.py @@ -24,7 +24,7 @@ def __call__( self.vqvae.to(torch_device) latents = torch.randn( - (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size), + (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), generator=generator, ) latents = latents.to(torch_device) diff --git a/src/diffusers/pipelines/pndm/pipeline_pndm.py b/src/diffusers/pipelines/pndm/pipeline_pndm.py index d98699dd2506..17f34e4f045f 100644 --- a/src/diffusers/pipelines/pndm/pipeline_pndm.py +++ b/src/diffusers/pipelines/pndm/pipeline_pndm.py @@ -38,7 +38,7 @@ def __call__(self, batch_size=1, generator=None, torch_device=None, num_inferenc # Sample gaussian noise to begin loop image = torch.randn( - (batch_size, self.unet.in_channels, self.unet.image_size, self.unet.image_size), + (batch_size, self.unet.in_channels, self.unet.sample_size, self.unet.sample_size), generator=generator, ) image = image.to(torch_device) diff --git a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py index 5b3be8b66fad..ba8fbd762c40 100644 --- a/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py +++ b/src/diffusers/pipelines/score_sde_ve/pipeline_score_sde_ve.py @@ -14,7 +14,7 @@ def __init__(self, model, scheduler): def __call__(self, num_inference_steps=2000, generator=None, output_type="pil"): device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu") - img_size = self.model.config.image_size + img_size = self.model.config.sample_size shape = (1, 3, img_size, img_size) model = self.model.to(device) diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index 6a439a92aaa0..c1e1107f81d3 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -301,7 +301,7 @@ def prepare_init_args_and_inputs_for_common(self): "out_channels": 3, "in_channels": 3, "num_res_blocks": 2, - "image_size": 32, + "sample_size": 32, } inputs_dict = self.dummy_input return init_dict, inputs_dict @@ -316,7 +316,7 @@ def prepare_init_args_and_inputs_for_common(self): # if torch.cuda.is_available(): # torch.cuda.manual_seed_all(0) # -# noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) +# noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) # time_step = torch.tensor([10]) # # with torch.no_grad(): @@ -353,7 +353,7 @@ def output_shape(self): def prepare_init_args_and_inputs_for_common(self): init_dict = { - "image_size": 32, + "sample_size": 32, "in_channels": 4, "out_channels": 4, "num_res_blocks": 2, @@ -387,7 +387,7 @@ def test_output_pretrained(self): if torch.cuda.is_available(): torch.cuda.manual_seed_all(0) - noise = torch.randn(1, model.config.in_channels, model.config.image_size, model.config.image_size) + noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) time_step = torch.tensor([10] * noise.shape[0]) with torch.no_grad(): @@ -410,7 +410,7 @@ def test_output_pretrained(self): # if torch.cuda.is_available(): # torch.cuda.manual_seed_all(0) # -# noise = torch.randn(1, model.config.in_channels, model.config.image_size, 
model.config.image_size) +# noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size) # context = torch.ones((1, 16, 64), dtype=torch.float32) # time_step = torch.tensor([10] * noise.shape[0]) # @@ -572,7 +572,6 @@ def prepare_init_args_and_inputs_for_common(self): "embed_dim": 3, "sane_index_shape": False, "ch_mult": (1,), - "dropout": 0.0, "double_z": False, } inputs_dict = self.dummy_input @@ -694,7 +693,7 @@ def test_from_pretrained_save_pretrained(self): model = UNet2DModel( block_channels=(32, 64), num_res_blocks=2, - image_size=32, + sample_size=32, in_channels=3, out_channels=3, down_blocks=("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), From 3970c1c9f2abe758f7468ebe8e2236e533eca5ec Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 21:59:48 +0000 Subject: [PATCH 09/12] finish api / naming --- src/diffusers/configuration_utils.py | 40 +++++++-------- src/diffusers/dynamic_modules_utils.py | 20 ++++---- src/diffusers/modeling_utils.py | 56 ++++++++++----------- src/diffusers/models/unet_2d.py | 51 ++++++------------- src/diffusers/models/unet_2d_condition.py | 61 +++++++---------------- src/diffusers/pipeline_utils.py | 8 +-- tests/test_modeling_utils.py | 56 ++++++++++----------- 7 files changed, 121 insertions(+), 171 deletions(-) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index ed5406a84ada..e676f9fad6e4 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -95,9 +95,9 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool @classmethod def from_config( - cls, pretrained_model__name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs + cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs ): - config_dict = cls.get_config_dict(pretrained_model__name_or_path=pretrained_model__name_or_path, **kwargs) + config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) @@ -110,7 +110,7 @@ def from_config( @classmethod def get_config_dict( - cls, pretrained_model__name_or_path: Union[str, os.PathLike], **kwargs + cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs ) -> Tuple[Dict[str, Any], Dict[str, Any]]: cache_dir = kwargs.pop("cache_dir", DIFFUSERS_CACHE) force_download = kwargs.pop("force_download", False) @@ -123,7 +123,7 @@ def get_config_dict( user_agent = {"file_type": "config"} - pretrained_model__name_or_path = str(pretrained_model__name_or_path) + pretrained_model_name_or_path = str(pretrained_model_name_or_path) if cls.config_name is None: raise ValueError( @@ -131,25 +131,25 @@ def get_config_dict( "`ConfigMixin`. 
Please make sure to define `config_name` in a class inheriting from `ConfigMixin`" ) - if os.path.isfile(pretrained_model__name_or_path): - config_file = pretrained_model__name_or_path - elif os.path.isdir(pretrained_model__name_or_path): - if os.path.isfile(os.path.join(pretrained_model__name_or_path, cls.config_name)): + if os.path.isfile(pretrained_model_name_or_path): + config_file = pretrained_model_name_or_path + elif os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, cls.config_name)): # Load from a PyTorch checkpoint - config_file = os.path.join(pretrained_model__name_or_path, cls.config_name) + config_file = os.path.join(pretrained_model_name_or_path, cls.config_name) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model__name_or_path, subfolder, cls.config_name) + os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) ): - config_file = os.path.join(pretrained_model__name_or_path, subfolder, cls.config_name) + config_file = os.path.join(pretrained_model_name_or_path, subfolder, cls.config_name) else: raise EnvironmentError( - f"Error no file named {cls.config_name} found in directory {pretrained_model__name_or_path}." + f"Error no file named {cls.config_name} found in directory {pretrained_model_name_or_path}." ) else: try: # Load from URL or cache if already cached config_file = hf_hub_download( - pretrained_model__name_or_path, + pretrained_model_name_or_path, filename=cls.config_name, cache_dir=cache_dir, force_download=force_download, @@ -163,7 +163,7 @@ def get_config_dict( except RepositoryNotFoundError: raise EnvironmentError( - f"{pretrained_model__name_or_path} is not a local folder and is not a valid model identifier" + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier" " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a" " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli" " login` and pass `use_auth_token=True`." @@ -172,30 +172,30 @@ def get_config_dict( raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for" " this model name. Check the model page at" - f" 'https://huggingface.co/{pretrained_model__name_or_path}' for available revisions." + f" 'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." ) except EntryNotFoundError: raise EnvironmentError( - f"{pretrained_model__name_or_path} does not appear to have a file named {cls.config_name}." + f"{pretrained_model_name_or_path} does not appear to have a file named {cls.config_name}." ) except HTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" - f" {pretrained_model__name_or_path}:\n{err}" + f" {pretrained_model_name_or_path}:\n{err}" ) except ValueError: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" - f" in the cached files and it looks like {pretrained_model__name_or_path} is not the path to a" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" f" directory containing a {cls.config_name} file.\nCheckout your internet connection or see how to" " run the library in offline mode at" " 'https://huggingface.co/docs/diffusers/installation#offline-mode'." 
) except EnvironmentError: raise EnvironmentError( - f"Can't load config for '{pretrained_model__name_or_path}'. If you were trying to load it from " + f"Can't load config for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model__name_or_path}' is the correct path to a directory " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing a {cls.config_name} file" ) diff --git a/src/diffusers/dynamic_modules_utils.py b/src/diffusers/dynamic_modules_utils.py index 58e49e6209f3..0ebf916e7af5 100644 --- a/src/diffusers/dynamic_modules_utils.py +++ b/src/diffusers/dynamic_modules_utils.py @@ -149,7 +149,7 @@ def get_class_in_module(class_name, module_path): def get_cached_module_file( - pretrained_model__name_or_path: Union[str, os.PathLike], + pretrained_model_name_or_path: Union[str, os.PathLike], module_file: str, cache_dir: Optional[Union[str, os.PathLike]] = None, force_download: bool = False, @@ -164,7 +164,7 @@ def get_cached_module_file( Transformers module. Args: - pretrained_model__name_or_path (`str` or `os.PathLike`): + pretrained_model_name_or_path (`str` or `os.PathLike`): This can be either: - a string, the *model id* of a pretrained model configuration hosted inside a model repo on @@ -205,9 +205,9 @@ def get_cached_module_file( Returns: `str`: The path to the module inside the cache. """ - # Download and cache module_file from the repo `pretrained_model__name_or_path` of grab it if it's a local file. - pretrained_model__name_or_path = str(pretrained_model__name_or_path) - module_file_or_url = os.path.join(pretrained_model__name_or_path, module_file) + # Download and cache module_file from the repo `pretrained_model_name_or_path` of grab it if it's a local file. 
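
Aside: the `scripts/change_configs.py` changes in this series remap legacy config keys to the new names used by `UNet2DModel` / `UNet2DConditionModel`. Below is a hedged, self-contained sketch of that remapping; `migrate_config` is an illustrative name and only part of the mapping from the script is reproduced.

```python
# Sketch of the config-key migration performed by scripts/change_configs.py in
# this series; the function name is hypothetical and the mapping is a subset.
legacy_to_new = {
    "image_size": "sample_size",
    "num_res_blocks": "layers_per_block",
    "block_channels": "block_out_channels",
    "down_blocks": "down_block_types",
    "up_blocks": "up_block_types",
    "downscale_freq_shift": "freq_shift",
    "resnet_num_groups": "norm_num_groups",
    "resnet_act_fn": "act_fn",
    "resnet_eps": "norm_eps",
    "num_head_channels": "attention_head_dim",
}


def migrate_config(config: dict) -> dict:
    config = dict(config)
    # Move values from old keys to their renamed counterparts.
    for old_key, new_key in legacy_to_new.items():
        if old_key in config:
            config[new_key] = config.pop(old_key)
    # Block type identifiers drop their legacy "UNetRes" prefix as well.
    for key in ("down_block_types", "up_block_types"):
        if key in config:
            config[key] = [name.replace("UNetRes", "") for name in config[key]]
    return config
```
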
+ pretrained_model_name_or_path = str(pretrained_model_name_or_path) + module_file_or_url = os.path.join(pretrained_model_name_or_path, module_file) submodule = "local" if os.path.isfile(module_file_or_url): @@ -226,7 +226,7 @@ def get_cached_module_file( ) except EnvironmentError: - logger.error(f"Could not locate the {module_file} inside {pretrained_model__name_or_path}.") + logger.error(f"Could not locate the {module_file} inside {pretrained_model_name_or_path}.") raise # Check we have all the requirements in our environment @@ -242,12 +242,12 @@ def get_cached_module_file( shutil.copy(resolved_module_file, submodule_path / module_file) for module_needed in modules_needed: module_needed = f"{module_needed}.py" - shutil.copy(os.path.join(pretrained_model__name_or_path, module_needed), submodule_path / module_needed) + shutil.copy(os.path.join(pretrained_model_name_or_path, module_needed), submodule_path / module_needed) return os.path.join(full_submodule, module_file) def get_class_from_dynamic_module( - pretrained_model__name_or_path: Union[str, os.PathLike], + pretrained_model_name_or_path: Union[str, os.PathLike], module_file: str, class_name: str, cache_dir: Optional[Union[str, os.PathLike]] = None, @@ -270,7 +270,7 @@ def get_class_from_dynamic_module( Args: - pretrained_model__name_or_path (`str` or `os.PathLike`): + pretrained_model_name_or_path (`str` or `os.PathLike`): This can be either: - a string, the *model id* of a pretrained model configuration hosted inside a model repo on @@ -322,7 +322,7 @@ def get_class_from_dynamic_module( ```""" # And lastly we get the class inside our newly created module final_module = get_cached_module_file( - pretrained_model__name_or_path, + pretrained_model_name_or_path, module_file, cache_dir=cache_dir, force_download=force_download, diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index f30cbda6ae8e..8d20336312d9 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -207,7 +207,7 @@ def save_pretrained( logger.info(f"Model weights saved in {os.path.join(save_directory, WEIGHTS_NAME)}") @classmethod - def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Instantiate a pretrained pytorch model from a pre-trained model configuration. @@ -222,7 +222,7 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. weights are discarded. Parameters: - pretrained_model__name_or_path (`str` or `os.PathLike`, *optional*): + pretrained_model_name_or_path (`str` or `os.PathLike`, *optional*): Can be either: - A string, the *model id* of a pretrained model hosted inside a model repo on huggingface.co. @@ -244,17 +244,17 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. model). - The model was saved using [`~ModelMixin.save_pretrained`] and is reloaded by supplying the save directory. - - The model is loaded by supplying a local directory as `pretrained_model__name_or_path` and a + - The model is loaded by supplying a local directory as `pretrained_model_name_or_path` and a configuration JSON file named *config.json* is found in the directory. cache_dir (`Union[str, os.PathLike]`, *optional*): Path to a directory in which a downloaded pretrained model configuration should be cached if the standard cache should not be used. 
from_tf (`bool`, *optional*, defaults to `False`): Load the model weights from a TensorFlow checkpoint save file (see docstring of - `pretrained_model__name_or_path` argument). + `pretrained_model_name_or_path` argument). from_flax (`bool`, *optional*, defaults to `False`): Load the model weights from a Flax checkpoint save file (see docstring of - `pretrained_model__name_or_path` argument). + `pretrained_model_name_or_path` argument). ignore_mismatched_sizes (`bool`, *optional*, defaults to `False`): Whether or not to raise an error if some of the weights from the checkpoint do not have the same size as the weights of the model (if for instance, you are instantiating a model with 10 labels from a @@ -327,7 +327,7 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. user_agent = {"file_type": "model", "framework": "pytorch", "from_auto_class": from_auto_class} # Load config if we don't provide a configuration - config_path = pretrained_model__name_or_path + config_path = pretrained_model_name_or_path model, unused_kwargs = cls.from_config( config_path, cache_dir=cache_dir, @@ -341,27 +341,27 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. subfolder=subfolder, **kwargs, ) - model.register_to_config(_name_or_path=pretrained_model__name_or_path) + model.register_to_config(_name_or_path=pretrained_model_name_or_path) # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the # Load model - pretrained_model__name_or_path = str(pretrained_model__name_or_path) - if os.path.isdir(pretrained_model__name_or_path): - if os.path.isfile(os.path.join(pretrained_model__name_or_path, WEIGHTS_NAME)): + pretrained_model_name_or_path = str(pretrained_model_name_or_path) + if os.path.isdir(pretrained_model_name_or_path): + if os.path.isfile(os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME)): # Load from a PyTorch checkpoint - model_file = os.path.join(pretrained_model__name_or_path, WEIGHTS_NAME) + model_file = os.path.join(pretrained_model_name_or_path, WEIGHTS_NAME) elif subfolder is not None and os.path.isfile( - os.path.join(pretrained_model__name_or_path, subfolder, WEIGHTS_NAME) + os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME) ): - model_file = os.path.join(pretrained_model__name_or_path, subfolder, WEIGHTS_NAME) + model_file = os.path.join(pretrained_model_name_or_path, subfolder, WEIGHTS_NAME) else: raise EnvironmentError( - f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model__name_or_path}." + f"Error no file named {WEIGHTS_NAME} found in directory {pretrained_model_name_or_path}." ) else: try: # Load from URL or cache if already cached model_file = hf_hub_download( - pretrained_model__name_or_path, + pretrained_model_name_or_path, filename=WEIGHTS_NAME, cache_dir=cache_dir, force_download=force_download, @@ -375,7 +375,7 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. except RepositoryNotFoundError: raise EnvironmentError( - f"{pretrained_model__name_or_path} is not a local folder and is not a valid model identifier " + f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier " "listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a " "token having permission to this repo with `use_auth_token` or log in with `huggingface-cli " "login` and pass `use_auth_token=True`." 
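
Usage sketch: after the renames in this series, the loaders take `pretrained_model_name_or_path` and the model config exposes `sample_size` rather than `image_size`. The snippet below mirrors the pattern in `scripts/generate_logits.py`; the repo id is illustrative and assumes a checkpoint already migrated to the new config keys.

```python
# Hedged sketch, not a definitive example: assumes "google/ddpm-cifar10-32"
# hosts a checkpoint whose config already uses the renamed parameters.
import torch
from diffusers import UNet2DModel

model = UNet2DModel.from_pretrained("google/ddpm-cifar10-32")

# Noise shape now derives from config.sample_size instead of config.image_size.
noise = torch.randn(1, model.config.in_channels, model.config.sample_size, model.config.sample_size)
timestep = torch.tensor([10] * noise.shape[0])

with torch.no_grad():
    sample = model(noise, timestep)["sample"]
```
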
@@ -384,30 +384,30 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. raise EnvironmentError( f"{revision} is not a valid git identifier (branch name, tag name or commit id) that exists for " "this model name. Check the model page at " - f"'https://huggingface.co/{pretrained_model__name_or_path}' for available revisions." + f"'https://huggingface.co/{pretrained_model_name_or_path}' for available revisions." ) except EntryNotFoundError: raise EnvironmentError( - f"{pretrained_model__name_or_path} does not appear to have a file named {model_file}." + f"{pretrained_model_name_or_path} does not appear to have a file named {model_file}." ) except HTTPError as err: raise EnvironmentError( "There was a specific connection error when trying to load" - f" {pretrained_model__name_or_path}:\n{err}" + f" {pretrained_model_name_or_path}:\n{err}" ) except ValueError: raise EnvironmentError( f"We couldn't connect to '{HUGGINGFACE_CO_RESOLVE_ENDPOINT}' to load this model, couldn't find it" - f" in the cached files and it looks like {pretrained_model__name_or_path} is not the path to a" + f" in the cached files and it looks like {pretrained_model_name_or_path} is not the path to a" f" directory containing a file named {WEIGHTS_NAME} or" " \nCheckout your internet connection or see how to run the library in" " offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'." ) except EnvironmentError: raise EnvironmentError( - f"Can't load the model for '{pretrained_model__name_or_path}'. If you were trying to load it from " + f"Can't load the model for '{pretrained_model_name_or_path}'. If you were trying to load it from " "'https://huggingface.co/models', make sure you don't have a local directory with the same name. " - f"Otherwise, make sure '{pretrained_model__name_or_path}' is the correct path to a directory " + f"Otherwise, make sure '{pretrained_model_name_or_path}' is the correct path to a directory " f"containing a file named {WEIGHTS_NAME}" ) @@ -417,7 +417,7 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. model, state_dict, model_file, - pretrained_model__name_or_path, + pretrained_model_name_or_path, ignore_mismatched_sizes=ignore_mismatched_sizes, ) @@ -441,7 +441,7 @@ def _load_pretrained_model( model, state_dict, resolved_archive_file, - pretrained_model__name_or_path, + pretrained_model_name_or_path, ignore_mismatched_sizes=False, ): # Retrieve missing & unexpected_keys @@ -500,7 +500,7 @@ def _find_mismatched_keys( if False: if len(unexpected_keys) > 0: logger.warning( - f"Some weights of the model checkpoint at {pretrained_model__name_or_path} were not used when" + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" " or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a" @@ -514,13 +514,13 @@ def _find_mismatched_keys( if len(missing_keys) > 0: logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model__name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" " TRAIN this model on a down-stream task to be able to use it for predictions and inference." ) elif len(mismatched_keys) == 0: logger.info( f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model__name_or_path}.\nIf your task is similar to the task the model of the" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" " without further training." ) @@ -533,7 +533,7 @@ def _find_mismatched_keys( ) logger.warning( f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model__name_or_path} and are newly initialized because the shapes did not" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" " able to use it for predictions and inference." ) diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 72f940dd0b8f..93a35858616e 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -14,46 +14,25 @@ class UNet2DModel(ModelMixin, ConfigMixin): @register_to_config def __init__( self, - image_size=None, - in_channels=None, - out_channels=None, - num_res_blocks=None, - block_channels=(224, 448, 672, 896), - down_blocks=( - "DownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", - "AttnDownBlock2D", - ), - downsample_padding=1, - up_blocks=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), - resnet_act_fn="silu", - resnet_eps=1e-5, - num_head_channels=32, - flip_sin_to_cos=True, - downscale_freq_shift=0, + sample_size = None, + in_channels=3, + out_channels=3, + center_input_sample=False, time_embedding_type="positional", + freq_shift = 0, + flip_sin_to_cos=True, + down_blocks=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"), + up_blocks=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), + block_out_channels=(224, 448, 672, 896), + layers_per_block = 2, mid_block_scale_factor=1, - center_input_sample=False, - resnet_num_groups=32, - sample_size = None, - layers_per_block = None, - block_out_channels = None, - freq_shift = None, - norm_num_groups = None, - act_fn = None, - norm_eps = None, - attention_head_dim = None, + downsample_padding=1, + act_fn = "silu", + attention_head_dim = 8, + norm_num_groups = 32, + norm_eps = 1e-5, ): super().__init__() - sample_size = sample_size or image_size - layers_per_block = layers_per_block or num_res_blocks - block_out_channels = block_out_channels or block_channels - freq_shift = freq_shift or downscale_freq_shift - norm_num_groups = norm_num_groups or resnet_num_groups - act_fn = act_fn or resnet_act_fn - norm_eps = norm_eps or resnet_eps - attention_head_dim = attention_head_dim or num_head_channels self.sample_size = sample_size time_embed_dim = block_out_channels[0] * 4 diff --git 
a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index 49f70b4e933c..b8818fac16e8 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -14,53 +14,26 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin): @register_to_config def __init__( self, - image_size=None, + sample_size = None, in_channels=4, out_channels=4, - num_res_blocks=2, - block_channels=(320, 640, 1280, 1280), - down_blocks=( - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "CrossAttnDownBlock2D", - "DownBlock2D", - ), - downsample_padding=1, - up_blocks=( - "UpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", - "CrossAttnUpBlock2D", - ), - resnet_act_fn="silu", - resnet_eps=1e-5, - conv_resample=True, - num_head_channels=8, + center_input_sample=False, flip_sin_to_cos=True, - downscale_freq_shift=0, + freq_shift = 0, + down_blocks=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + up_blocks=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + block_out_channels=(320, 640, 1280, 1280), + layers_per_block = 2, + downsample_padding=1, mid_block_scale_factor=1, - center_input_sample=False, - resnet_num_groups=30, - sample_size = None, - layers_per_block = None, - block_out_channels = None, - freq_shift = None, - norm_num_groups = None, - act_fn = None, - norm_eps = None, - attention_head_dim = None, + act_fn = "silu", + norm_num_groups = 32, + norm_eps = 1e-5, + attention_head_dim = 8, ): super().__init__() - sample_size = sample_size or image_size - layers_per_block = layers_per_block or num_res_blocks - block_out_channels = block_out_channels or block_channels - freq_shift = freq_shift or downscale_freq_shift - norm_num_groups = norm_num_groups or resnet_num_groups - act_fn = act_fn or resnet_act_fn - norm_eps = norm_eps or resnet_eps - attention_head_dim = attention_head_dim or num_head_channels - - self.mage_size = sample_size + + self.sample_size = sample_size time_embed_dim = block_out_channels[0] * 4 # input @@ -91,7 +64,7 @@ def __init__( temb_channels=time_embed_dim, add_downsample=not is_final_block, resnet_eps=norm_eps, - resnet_act_fn=resnet_act_fn, + resnet_act_fn=act_fn, attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, ) @@ -102,7 +75,7 @@ def __init__( in_channels=block_out_channels[-1], temb_channels=time_embed_dim, resnet_eps=norm_eps, - resnet_act_fn=resnet_act_fn, + resnet_act_fn=act_fn, output_scale_factor=mid_block_scale_factor, resnet_time_scale_shift="default", attn_num_head_channels=attention_head_dim, @@ -128,7 +101,7 @@ def __init__( temb_channels=time_embed_dim, add_upsample=not is_final_block, resnet_eps=norm_eps, - resnet_act_fn=resnet_act_fn, + resnet_act_fn=act_fn, attn_num_head_channels=attention_head_dim, ) self.upsample_blocks.append(up_block) diff --git a/src/diffusers/pipeline_utils.py b/src/diffusers/pipeline_utils.py index 1fd4ff656718..ee593b463210 100644 --- a/src/diffusers/pipeline_utils.py +++ b/src/diffusers/pipeline_utils.py @@ -112,7 +112,7 @@ def save_pretrained(self, save_directory: Union[str, os.PathLike]): save_method(os.path.join(save_directory, pipeline_component_name)) @classmethod - def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os.PathLike]], **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], **kwargs): r""" Add docstrings """ @@ -125,9 +125,9 @@ def from_pretrained(cls, 
pretrained_model__name_or_path: Optional[Union[str, os. # 1. Download the checkpoints and configs # use snapshot download here to get it working from from_pretrained - if not os.path.isdir(pretrained_model__name_or_path): + if not os.path.isdir(pretrained_model_name_or_path): cached_folder = snapshot_download( - pretrained_model__name_or_path, + pretrained_model_name_or_path, cache_dir=cache_dir, resume_download=resume_download, proxies=proxies, @@ -136,7 +136,7 @@ def from_pretrained(cls, pretrained_model__name_or_path: Optional[Union[str, os. revision=revision, ) else: - cached_folder = pretrained_model__name_or_path + cached_folder = pretrained_model_name_or_path config_dict = cls.get_config_dict(cached_folder) diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index c1e1107f81d3..f4d1f20d8c52 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -13,7 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. - import inspect import math import tempfile @@ -78,7 +77,7 @@ def test_register_to_config(self): assert config["e"] == [1, 3] # init ignore private arguments - obj = SampleObject(__name_or_path="lalala") + obj = SampleObject(_name_or_path="lalala") config = obj.config assert config["a"] == 2 assert config["b"] == 5 @@ -294,13 +293,13 @@ def output_shape(self): def prepare_init_args_and_inputs_for_common(self): init_dict = { - "block_channels": (32, 64), - "down_blocks": ("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), - "up_blocks": ("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), - "num_head_channels": None, + "block_out_channels": (32, 64), + "down_blocks": ("DownBlock2D", "AttnDownBlock2D"), + "up_blocks": ("AttnUpBlock2D", "UpBlock2D"), + "attention_head_dim": None, "out_channels": 3, "in_channels": 3, - "num_res_blocks": 2, + "layers_per_block": 2, "sample_size": 32, } inputs_dict = self.dummy_input @@ -356,12 +355,11 @@ def prepare_init_args_and_inputs_for_common(self): "sample_size": 32, "in_channels": 4, "out_channels": 4, - "num_res_blocks": 2, - "block_channels": (32, 64), - "num_head_channels": 32, - "conv_resample": True, - "down_blocks": ("UNetResDownBlock2D", "UNetResDownBlock2D"), - "up_blocks": ("UNetResUpBlock2D", "UNetResUpBlock2D"), + "layers_per_block": 2, + "block_out_channels": (32, 64), + "attention_head_dim": 32, + "down_blocks": ("DownBlock2D", "DownBlock2D"), + "up_blocks": ("UpBlock2D", "UpBlock2D"), } inputs_dict = self.dummy_input return init_dict, inputs_dict @@ -449,25 +447,25 @@ def output_shape(self): def prepare_init_args_and_inputs_for_common(self): init_dict = { - "block_channels": [32, 64, 64, 64], + "block_out_channels": [32, 64, 64, 64], "in_channels": 3, - "num_res_blocks": 1, + "layers_per_block": 1, "out_channels": 3, "time_embedding_type": "fourier", - "resnet_eps": 1e-6, + "norm_eps": 1e-6, "mid_block_scale_factor": math.sqrt(2.0), - "resnet_num_groups": None, + "norm_num_groups": None, "down_blocks": [ - "UNetResSkipDownBlock2D", - "UNetResAttnSkipDownBlock2D", - "UNetResSkipDownBlock2D", - "UNetResSkipDownBlock2D", + "SkipDownBlock2D", + "AttnSkipDownBlock2D", + "SkipDownBlock2D", + "SkipDownBlock2D", ], "up_blocks": [ - "UNetResSkipUpBlock2D", - "UNetResSkipUpBlock2D", - "UNetResAttnSkipUpBlock2D", - "UNetResSkipUpBlock2D", + "SkipUpBlock2D", + "SkipUpBlock2D", + "AttnSkipUpBlock2D", + "SkipUpBlock2D", ], } inputs_dict = self.dummy_input @@ -691,13 +689,13 @@ class PipelineTesterMixin(unittest.TestCase): def 
test_from_pretrained_save_pretrained(self): # 1. Load models model = UNet2DModel( - block_channels=(32, 64), - num_res_blocks=2, + block_out_channels=(32, 64), + layers_per_block=2, sample_size=32, in_channels=3, out_channels=3, - down_blocks=("UNetResDownBlock2D", "UNetResAttnDownBlock2D"), - up_blocks=("UNetResAttnUpBlock2D", "UNetResUpBlock2D"), + down_blocks=("DownBlock2D", "AttnDownBlock2D"), + up_blocks=("AttnUpBlock2D", "UpBlock2D"), ) schedular = DDPMScheduler(num_train_timesteps=10) From ee82127003b584536a04a56ba9e62174ecd3bbcb Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 22:30:15 +0000 Subject: [PATCH 10/12] finish config renaming --- scripts/change_configs.py | 72 +++++++++++++---------- src/diffusers/configuration_utils.py | 4 +- src/diffusers/models/resnet.py | 2 +- src/diffusers/models/unet_2d.py | 23 ++++---- src/diffusers/models/unet_2d_condition.py | 23 ++++---- tests/test_modeling_utils.py | 16 ++--- 6 files changed, 74 insertions(+), 66 deletions(-) diff --git a/scripts/change_configs.py b/scripts/change_configs.py index c8d714dbe9a8..7f50f0cdb4df 100644 --- a/scripts/change_configs.py +++ b/scripts/change_configs.py @@ -16,10 +16,15 @@ import argparse import os +import json import torch from diffusers import UNet2DModel, UNet2DConditionModel from transformers.file_utils import has_file +do_only_config = False +do_only_weights = True +do_only_renaming = False + if __name__ == "__main__": parser = argparse.ArgumentParser() @@ -42,6 +47,8 @@ "image_size": "sample_size", "num_res_blocks": "layers_per_block", "block_channels": "block_out_channels", + "down_blocks": "down_block_types", + "up_blocks": "up_block_types", "downscale_freq_shift": "freq_shift", "resnet_num_groups": "norm_num_groups", "resnet_act_fn": "act_fn", @@ -52,45 +59,50 @@ key_parameters_to_change = { "time_steps": "time_proj", "mid": "mid_block", - "downsample_blocks": "down_blocks", - "upsample_blocks": "up_blocks", + "downsample_blocks": "down_block_types", + "upsample_blocks": "up_block_types", } + subfolder = "" if has_file(args.repo_path, "config.json") else "unet" + + with open(os.path.join(args.repo_path, subfolder, "config.json"), "r", encoding="utf-8") as reader: + text = reader.read() + config = json.loads(text) + + if do_only_config: + for key in config_parameters_to_change.keys(): + config.pop(key, None) + if has_file(args.repo_path, "config.json"): - model = UNet2DModel.from_config(args.repo_path) - subfolder = "" + model = UNet2DModel(**config) else: - subfolder = "unet" class_name = UNet2DConditionModel if "ldm-text2im-large-256" in args.repo_path else UNet2DModel - model = class_name.from_config(args.repo_path, subfolder=subfolder) + model = class_name(**config) - config = dict(model.config) + if do_only_config: + model.save_config(os.path.join(args.repo_path, subfolder)) - for key, value in config_parameters_to_change.items(): - if key in config: - config[value] = config[key] - del config[key] + config = dict(model.config) - config["down_blocks"] = [k.replace("UNetRes", "") for k in config["down_blocks"]] - config["up_blocks"] = [k.replace("UNetRes", "") for k in config["up_blocks"]] + if do_only_renaming: + for key, value in config_parameters_to_change.items(): + if key in config: + config[value] = config[key] + del config[key] - if has_file(args.repo_path, "config.json"): - model = UNet2DModel(**config) - else: - model = UNet2DConditionModel(**config) + config["down_block_types"] = [k.replace("UNetRes", "") for k in config["down_block_types"]] + 
config["up_block_types"] = [k.replace("UNetRes", "") for k in config["up_block_types"]] - state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin")) + if do_only_weights: + state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin")) - new_state_dict = {} - for key, new_key in key_parameters_to_change.items(): - for param_key, param_value in state_dict.items(): - if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): - continue - else: - new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value + new_state_dict = {} + for key, new_key in key_parameters_to_change.items(): + for param_key, param_value in state_dict.items(): + if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): + continue + else: + new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value - model.load_state_dict(state_dict) - if has_file(args.repo_path, "config.json"): - model.save_pretrained(args.repo_path) - else: - model.save_pretrained(os.path.join(args.repo_path, "unet")) + model.load_state_dict(state_dict) + model.save_pretrained(os.path.join(args.repo_path, subfolder, "unet")) diff --git a/src/diffusers/configuration_utils.py b/src/diffusers/configuration_utils.py index e676f9fad6e4..71cb9b7315fd 100644 --- a/src/diffusers/configuration_utils.py +++ b/src/diffusers/configuration_utils.py @@ -94,9 +94,7 @@ def save_config(self, save_directory: Union[str, os.PathLike], push_to_hub: bool logger.info(f"ConfigMixinuration saved in {output_config_file}") @classmethod - def from_config( - cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs - ): + def from_config(cls, pretrained_model_name_or_path: Union[str, os.PathLike], return_unused_kwargs=False, **kwargs): config_dict = cls.get_config_dict(pretrained_model_name_or_path=pretrained_model_name_or_path, **kwargs) init_dict, unused_kwargs = cls.extract_init_dict(config_dict, **kwargs) diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index ade7db825cf8..ce0fbd4b2b8d 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -78,9 +78,9 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name= # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed if name == "conv": + self.Conv2d_0 = conv self.conv = conv elif name == "Conv2d_0": - self.Conv2d_0 = conv self.conv = conv else: self.op = conv diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 93a35858616e..5221d2e99d60 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -10,27 +10,26 @@ class UNet2DModel(ModelMixin, ConfigMixin): - @register_to_config def __init__( self, - sample_size = None, + sample_size=None, in_channels=3, out_channels=3, center_input_sample=False, time_embedding_type="positional", - freq_shift = 0, + freq_shift=0, flip_sin_to_cos=True, - down_blocks=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"), - up_blocks=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), + down_block_types=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"), + up_block_types=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"), block_out_channels=(224, 448, 672, 896), - layers_per_block = 2, + layers_per_block=2, mid_block_scale_factor=1, downsample_padding=1, - act_fn = 
"silu", - attention_head_dim = 8, - norm_num_groups = 32, - norm_eps = 1e-5, + act_fn="silu", + attention_head_dim=8, + norm_num_groups=32, + norm_eps=1e-5, ): super().__init__() @@ -56,7 +55,7 @@ def __init__( # down output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_blocks): + for i, down_block_type in enumerate(down_block_types): input_channel = output_channel output_channel = block_out_channels[i] is_final_block = i == len(block_out_channels) - 1 @@ -90,7 +89,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_blocks): + for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index b8818fac16e8..cb68a9ac5f80 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -10,26 +10,25 @@ class UNet2DConditionModel(ModelMixin, ConfigMixin): - @register_to_config def __init__( self, - sample_size = None, + sample_size=None, in_channels=4, out_channels=4, center_input_sample=False, flip_sin_to_cos=True, - freq_shift = 0, - down_blocks=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), - up_blocks=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), + freq_shift=0, + down_block_types=("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D"), + up_block_types=("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D"), block_out_channels=(320, 640, 1280, 1280), - layers_per_block = 2, + layers_per_block=2, downsample_padding=1, mid_block_scale_factor=1, - act_fn = "silu", - norm_num_groups = 32, - norm_eps = 1e-5, - attention_head_dim = 8, + act_fn="silu", + norm_num_groups=32, + norm_eps=1e-5, + attention_head_dim=8, ): super().__init__() @@ -51,7 +50,7 @@ def __init__( # down output_channel = block_out_channels[0] - for i, down_block_type in enumerate(down_blocks): + for i, down_block_type in enumerate(down_block_types): input_channel = output_channel output_channel = block_out_channels[i] is_final_block = i == len(block_out_channels) - 1 @@ -85,7 +84,7 @@ def __init__( # up reversed_block_out_channels = list(reversed(block_out_channels)) output_channel = reversed_block_out_channels[0] - for i, up_block_type in enumerate(up_blocks): + for i, up_block_type in enumerate(up_block_types): prev_output_channel = output_channel output_channel = reversed_block_out_channels[i] input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] diff --git a/tests/test_modeling_utils.py b/tests/test_modeling_utils.py index f4d1f20d8c52..5df13f3a5e4f 100755 --- a/tests/test_modeling_utils.py +++ b/tests/test_modeling_utils.py @@ -294,8 +294,8 @@ def output_shape(self): def prepare_init_args_and_inputs_for_common(self): init_dict = { "block_out_channels": (32, 64), - "down_blocks": ("DownBlock2D", "AttnDownBlock2D"), - "up_blocks": ("AttnUpBlock2D", "UpBlock2D"), + "down_block_types": ("DownBlock2D", "AttnDownBlock2D"), + "up_block_types": ("AttnUpBlock2D", "UpBlock2D"), "attention_head_dim": None, "out_channels": 3, "in_channels": 3, @@ -358,8 +358,8 @@ def prepare_init_args_and_inputs_for_common(self): 
"layers_per_block": 2, "block_out_channels": (32, 64), "attention_head_dim": 32, - "down_blocks": ("DownBlock2D", "DownBlock2D"), - "up_blocks": ("UpBlock2D", "UpBlock2D"), + "down_block_types": ("DownBlock2D", "DownBlock2D"), + "up_block_types": ("UpBlock2D", "UpBlock2D"), } inputs_dict = self.dummy_input return init_dict, inputs_dict @@ -455,13 +455,13 @@ def prepare_init_args_and_inputs_for_common(self): "norm_eps": 1e-6, "mid_block_scale_factor": math.sqrt(2.0), "norm_num_groups": None, - "down_blocks": [ + "down_block_types": [ "SkipDownBlock2D", "AttnSkipDownBlock2D", "SkipDownBlock2D", "SkipDownBlock2D", ], - "up_blocks": [ + "up_block_types": [ "SkipUpBlock2D", "SkipUpBlock2D", "AttnSkipUpBlock2D", @@ -694,8 +694,8 @@ def test_from_pretrained_save_pretrained(self): sample_size=32, in_channels=3, out_channels=3, - down_blocks=("DownBlock2D", "AttnDownBlock2D"), - up_blocks=("AttnUpBlock2D", "UpBlock2D"), + down_block_types=("DownBlock2D", "AttnDownBlock2D"), + up_block_types=("AttnUpBlock2D", "UpBlock2D"), ) schedular = DDPMScheduler(num_train_timesteps=10) From 899182e5e26a42d411971a8c41dd7221dcee060d Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 23:22:06 +0000 Subject: [PATCH 11/12] rename all weights --- scripts/change_configs.py | 26 ++++---- scripts/generate_logits.py | 2 +- src/diffusers/modeling_utils.py | 79 +++++++++++------------ src/diffusers/models/resnet.py | 1 - src/diffusers/models/unet_2d.py | 24 +++---- src/diffusers/models/unet_2d_condition.py | 22 +++---- 6 files changed, 78 insertions(+), 76 deletions(-) diff --git a/scripts/change_configs.py b/scripts/change_configs.py index 7f50f0cdb4df..20e1d5c7520a 100644 --- a/scripts/change_configs.py +++ b/scripts/change_configs.py @@ -59,8 +59,8 @@ key_parameters_to_change = { "time_steps": "time_proj", "mid": "mid_block", - "downsample_blocks": "down_block_types", - "upsample_blocks": "up_block_types", + "downsample_blocks": "down_blocks", + "upsample_blocks": "up_blocks", } subfolder = "" if has_file(args.repo_path, "config.json") else "unet" @@ -97,12 +97,16 @@ state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin")) new_state_dict = {} - for key, new_key in key_parameters_to_change.items(): - for param_key, param_value in state_dict.items(): - if param_key.endswith(".op") or param_key.endswith(".Conv2d_0"): - continue - else: - new_state_dict[param_key.replace(key, new_key) if param_key.startswith(key) else param_key] = param_value - - model.load_state_dict(state_dict) - model.save_pretrained(os.path.join(args.repo_path, subfolder, "unet")) + for param_key, param_value in state_dict.items(): + if param_key.endswith(".op.bias") or param_key.endswith(".op.weight"): + continue + has_changed = False + for key, new_key in key_parameters_to_change.items(): + if not has_changed and param_key.split(".")[0] == key: + new_state_dict[".".join([new_key] + param_key.split(".")[1:])] = param_value + has_changed = True + if not has_changed: + new_state_dict[param_key] = param_value + + model.load_state_dict(new_state_dict) + model.save_pretrained(os.path.join(args.repo_path, subfolder)) diff --git a/scripts/generate_logits.py b/scripts/generate_logits.py index d586c4994d8e..4dbe30f7e57f 100644 --- a/scripts/generate_logits.py +++ b/scripts/generate_logits.py @@ -87,5 +87,5 @@ with torch.no_grad(): logits = model(noise, time_step)['sample'] - torch.allclose(logits[0, 0, 0, :30], results["_".join("_".join(mod.modelId.split("/")).split("-"))], atol=1e-3) + assert 
torch.allclose(logits[0, 0, 0, :30], results["_".join("_".join(mod.modelId.split("/")).split("-"))], atol=1e-3) print(f"{mod.modelId} has passed succesfully!!!") diff --git a/src/diffusers/modeling_utils.py b/src/diffusers/modeling_utils.py index 8d20336312d9..44a696ca8d60 100644 --- a/src/diffusers/modeling_utils.py +++ b/src/diffusers/modeling_utils.py @@ -497,46 +497,45 @@ def _find_mismatched_keys( ) raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}") - if False: - if len(unexpected_keys) > 0: - logger.warning( - f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" - f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" - f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" - " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" - " BertForPreTraining model).\n- This IS NOT expected if you are initializing" - f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" - " identical (initializing a BertForSequenceClassification model from a" - " BertForSequenceClassification model)." - ) - else: - logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") - if len(missing_keys) > 0: - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" - " TRAIN this model on a down-stream task to be able to use it for predictions and inference." - ) - elif len(mismatched_keys) == 0: - logger.info( - f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" - f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" - f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" - " without further training." - ) - if len(mismatched_keys) > 0: - mismatched_warning = "\n".join( - [ - f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" - for key, shape1, shape2 in mismatched_keys - ] - ) - logger.warning( - f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" - f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" - f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" - " able to use it for predictions and inference." - ) + if len(unexpected_keys) > 0: + logger.warning( + f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when" + f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are" + f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task" + " or with another architecture (e.g. initializing a BertForSequenceClassification model from a" + " BertForPreTraining model).\n- This IS NOT expected if you are initializing" + f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly" + " identical (initializing a BertForSequenceClassification model from a" + " BertForSequenceClassification model)." 
+ ) + else: + logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n") + if len(missing_keys) > 0: + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably" + " TRAIN this model on a down-stream task to be able to use it for predictions and inference." + ) + elif len(mismatched_keys) == 0: + logger.info( + f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at" + f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the" + f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions" + " without further training." + ) + if len(mismatched_keys) > 0: + mismatched_warning = "\n".join( + [ + f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated" + for key, shape1, shape2 in mismatched_keys + ] + ) + logger.warning( + f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at" + f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not" + f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be" + " able to use it for predictions and inference." + ) return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs diff --git a/src/diffusers/models/resnet.py b/src/diffusers/models/resnet.py index ce0fbd4b2b8d..a54199c1a242 100644 --- a/src/diffusers/models/resnet.py +++ b/src/diffusers/models/resnet.py @@ -83,7 +83,6 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name= elif name == "Conv2d_0": self.conv = conv else: - self.op = conv self.conv = conv def forward(self, x): diff --git a/src/diffusers/models/unet_2d.py b/src/diffusers/models/unet_2d.py index 5221d2e99d60..6203d76f2586 100644 --- a/src/diffusers/models/unet_2d.py +++ b/src/diffusers/models/unet_2d.py @@ -41,17 +41,17 @@ def __init__( # time if time_embedding_type == "fourier": - self.time_steps = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) + self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16) timestep_input_dim = 2 * block_out_channels[0] elif time_embedding_type == "positional": - self.time_steps = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - self.downsample_blocks = nn.ModuleList([]) - self.mid = None - self.upsample_blocks = nn.ModuleList([]) + self.down_blocks = nn.ModuleList([]) + self.mid_block = None + self.up_blocks = nn.ModuleList([]) # down output_channel = block_out_channels[0] @@ -72,10 +72,10 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, ) - self.downsample_blocks.append(down_block) + self.down_blocks.append(down_block) # mid - self.mid = UNetMidBlock2D( + self.mid_block = UNetMidBlock2D( in_channels=block_out_channels[-1], temb_channels=time_embed_dim, resnet_eps=norm_eps, @@ -108,7 +108,7 @@ def __init__( resnet_act_fn=act_fn, attn_num_head_channels=attention_head_dim, ) - self.upsample_blocks.append(up_block) + self.up_blocks.append(up_block) prev_output_channel = output_channel # out @@ -132,7 +132,7 @@ 
def forward( elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: timesteps = timesteps[None].to(sample.device) - t_emb = self.time_steps(timesteps) + t_emb = self.time_proj(timesteps) emb = self.time_embedding(t_emb) # 2. pre-process @@ -141,7 +141,7 @@ def forward( # 3. down down_block_res_samples = (sample,) - for downsample_block in self.downsample_blocks: + for downsample_block in self.down_blocks: if hasattr(downsample_block, "skip_conv"): sample, res_samples, skip_sample = downsample_block( hidden_states=sample, temb=emb, skip_sample=skip_sample @@ -152,11 +152,11 @@ def forward( down_block_res_samples += res_samples # 4. mid - sample = self.mid(sample, emb) + sample = self.mid_block(sample, emb) # 5. up skip_sample = None - for upsample_block in self.upsample_blocks: + for upsample_block in self.up_blocks: res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] diff --git a/src/diffusers/models/unet_2d_condition.py b/src/diffusers/models/unet_2d_condition.py index cb68a9ac5f80..ae82e202bfa9 100644 --- a/src/diffusers/models/unet_2d_condition.py +++ b/src/diffusers/models/unet_2d_condition.py @@ -39,14 +39,14 @@ def __init__( self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1)) # time - self.time_steps = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) timestep_input_dim = block_out_channels[0] self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) - self.downsample_blocks = nn.ModuleList([]) - self.mid = None - self.upsample_blocks = nn.ModuleList([]) + self.down_blocks = nn.ModuleList([]) + self.mid_block = None + self.up_blocks = nn.ModuleList([]) # down output_channel = block_out_channels[0] @@ -67,10 +67,10 @@ def __init__( attn_num_head_channels=attention_head_dim, downsample_padding=downsample_padding, ) - self.downsample_blocks.append(down_block) + self.down_blocks.append(down_block) # mid - self.mid = UNetMidBlock2DCrossAttn( + self.mid_block = UNetMidBlock2DCrossAttn( in_channels=block_out_channels[-1], temb_channels=time_embed_dim, resnet_eps=norm_eps, @@ -103,7 +103,7 @@ def __init__( resnet_act_fn=act_fn, attn_num_head_channels=attention_head_dim, ) - self.upsample_blocks.append(up_block) + self.up_blocks.append(up_block) prev_output_channel = output_channel # out @@ -129,7 +129,7 @@ def forward( elif torch.is_tensor(timesteps) and len(timesteps.shape) == 0: timesteps = timesteps[None].to(sample.device) - t_emb = self.time_steps(timesteps) + t_emb = self.time_proj(timesteps) emb = self.time_embedding(t_emb) # 2. pre-process @@ -137,7 +137,7 @@ def forward( # 3. down down_block_res_samples = (sample,) - for downsample_block in self.downsample_blocks: + for downsample_block in self.down_blocks: if hasattr(downsample_block, "attentions") and downsample_block.attentions is not None: sample, res_samples = downsample_block( @@ -149,10 +149,10 @@ def forward( down_block_res_samples += res_samples # 4. mid - sample = self.mid(sample, emb, encoder_hidden_states=encoder_hidden_states) + sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states) # 5. 
up - for upsample_block in self.upsample_blocks: + for upsample_block in self.up_blocks: res_samples = down_block_res_samples[-len(upsample_block.resnets) :] down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] From 2a4a0e1950de6f681f5225873dbfd3618ca662b9 Mon Sep 17 00:00:00 2001 From: Patrick von Platen Date: Wed, 20 Jul 2022 23:26:06 +0000 Subject: [PATCH 12/12] finish really --- scripts/change_configs.py | 112 ------------------ ...t_ddpm_original_checkpoint_to_diffusers.py | 34 +++--- ...rt_ldm_original_checkpoint_to_diffusers.py | 34 +++--- ...ncsnpp_original_checkpoint_to_diffusers.py | 12 +- 4 files changed, 40 insertions(+), 152 deletions(-) delete mode 100644 scripts/change_configs.py diff --git a/scripts/change_configs.py b/scripts/change_configs.py deleted file mode 100644 index 20e1d5c7520a..000000000000 --- a/scripts/change_configs.py +++ /dev/null @@ -1,112 +0,0 @@ -# coding=utf-8 -# Copyright 2022 The HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -""" Conversion script for the LDM checkpoints. """ - -import argparse -import os -import json -import torch -from diffusers import UNet2DModel, UNet2DConditionModel -from transformers.file_utils import has_file - -do_only_config = False -do_only_weights = True -do_only_renaming = False - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - - parser.add_argument( - "--repo_path", - default=None, - type=str, - required=True, - help="The config json file corresponding to the architecture.", - ) - - parser.add_argument( - "--dump_path", default=None, type=str, required=True, help="Path to the output model." 
- ) - - args = parser.parse_args() - - config_parameters_to_change = { - "image_size": "sample_size", - "num_res_blocks": "layers_per_block", - "block_channels": "block_out_channels", - "down_blocks": "down_block_types", - "up_blocks": "up_block_types", - "downscale_freq_shift": "freq_shift", - "resnet_num_groups": "norm_num_groups", - "resnet_act_fn": "act_fn", - "resnet_eps": "norm_eps", - "num_head_channels": "attention_head_dim", - } - - key_parameters_to_change = { - "time_steps": "time_proj", - "mid": "mid_block", - "downsample_blocks": "down_blocks", - "upsample_blocks": "up_blocks", - } - - subfolder = "" if has_file(args.repo_path, "config.json") else "unet" - - with open(os.path.join(args.repo_path, subfolder, "config.json"), "r", encoding="utf-8") as reader: - text = reader.read() - config = json.loads(text) - - if do_only_config: - for key in config_parameters_to_change.keys(): - config.pop(key, None) - - if has_file(args.repo_path, "config.json"): - model = UNet2DModel(**config) - else: - class_name = UNet2DConditionModel if "ldm-text2im-large-256" in args.repo_path else UNet2DModel - model = class_name(**config) - - if do_only_config: - model.save_config(os.path.join(args.repo_path, subfolder)) - - config = dict(model.config) - - if do_only_renaming: - for key, value in config_parameters_to_change.items(): - if key in config: - config[value] = config[key] - del config[key] - - config["down_block_types"] = [k.replace("UNetRes", "") for k in config["down_block_types"]] - config["up_block_types"] = [k.replace("UNetRes", "") for k in config["up_block_types"]] - - if do_only_weights: - state_dict = torch.load(os.path.join(args.repo_path, subfolder, "diffusion_pytorch_model.bin")) - - new_state_dict = {} - for param_key, param_value in state_dict.items(): - if param_key.endswith(".op.bias") or param_key.endswith(".op.weight"): - continue - has_changed = False - for key, new_key in key_parameters_to_change.items(): - if not has_changed and param_key.split(".")[0] == key: - new_state_dict[".".join([new_key] + param_key.split(".")[1:])] = param_value - has_changed = True - if not has_changed: - new_state_dict[param_key] = param_value - - model.load_state_dict(new_state_dict) - model.save_pretrained(os.path.join(args.repo_path, subfolder)) diff --git a/scripts/convert_ddpm_original_checkpoint_to_diffusers.py b/scripts/convert_ddpm_original_checkpoint_to_diffusers.py index 92b64c38ba52..216018c6a82f 100644 --- a/scripts/convert_ddpm_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ddpm_original_checkpoint_to_diffusers.py @@ -80,7 +80,7 @@ def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_s continue new_path = new_path.replace('down.', 'downsample_blocks.') - new_path = new_path.replace('up.', 'upsample_blocks.') + new_path = new_path.replace('up.', 'up_blocks.') if additional_replacements is not None: for replacement in additional_replacements: @@ -114,8 +114,8 @@ def convert_ddpm_checkpoint(checkpoint, config): num_downsample_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'down' in layer}) downsample_blocks = {layer_id: [key for key in checkpoint if f'down.{layer_id}' in key] for layer_id in range(num_downsample_blocks)} - num_upsample_blocks = len({'.'.join(layer.split('.')[:2]) for layer in checkpoint if 'up' in layer}) - upsample_blocks = {layer_id: [key for key in checkpoint if f'up.{layer_id}' in key] for layer_id in range(num_upsample_blocks)} + num_up_blocks = len({'.'.join(layer.split('.')[:2]) for layer in 
checkpoint if 'up' in layer}) + up_blocks = {layer_id: [key for key in checkpoint if f'up.{layer_id}' in key] for layer_id in range(num_up_blocks)} for i in range(num_downsample_blocks): block_id = (i - 1) // (config['num_res_blocks'] + 1) @@ -164,34 +164,34 @@ def convert_ddpm_checkpoint(checkpoint, config): {'old': 'mid.', 'new': 'mid_new_2.'}, {'old': 'attn_1', 'new': 'attentions.0'} ]) - for i in range(num_upsample_blocks): - block_id = num_upsample_blocks - 1 - i + for i in range(num_up_blocks): + block_id = num_up_blocks - 1 - i - if any('upsample' in layer for layer in upsample_blocks[i]): - new_checkpoint[f'upsample_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'up.{i}.upsample.conv.weight'] - new_checkpoint[f'upsample_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'up.{i}.upsample.conv.bias'] + if any('upsample' in layer for layer in up_blocks[i]): + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'up.{i}.upsample.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'up.{i}.upsample.conv.bias'] - if any('block' in layer for layer in upsample_blocks[i]): - num_blocks = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in upsample_blocks[i] if 'block' in layer}) - blocks = {layer_id: [key for key in upsample_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)} + if any('block' in layer for layer in up_blocks[i]): + num_blocks = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in up_blocks[i] if 'block' in layer}) + blocks = {layer_id: [key for key in up_blocks[i] if f'block.{layer_id}' in key] for layer_id in range(num_blocks)} if num_blocks > 0: for j in range(config['num_res_blocks'] + 1): - replace_indices = {'old': f'upsample_blocks.{i}', 'new': f'upsample_blocks.{block_id}'} + replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'} paths = renew_resnet_paths(blocks[j]) assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices]) - if any('attn' in layer for layer in upsample_blocks[i]): - num_attn = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in upsample_blocks[i] if 'attn' in layer}) - attns = {layer_id: [key for key in upsample_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)} + if any('attn' in layer for layer in up_blocks[i]): + num_attn = len({'.'.join(shave_segments(layer, 2).split('.')[:2]) for layer in up_blocks[i] if 'attn' in layer}) + attns = {layer_id: [key for key in up_blocks[i] if f'attn.{layer_id}' in key] for layer_id in range(num_blocks)} if num_attn > 0: for j in range(config['num_res_blocks'] + 1): - replace_indices = {'old': f'upsample_blocks.{i}', 'new': f'upsample_blocks.{block_id}'} + replace_indices = {'old': f'up_blocks.{i}', 'new': f'up_blocks.{block_id}'} paths = renew_attention_paths(attns[j]) assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[replace_indices]) - new_checkpoint = {k.replace('mid_new_2', 'mid'): v for k, v in new_checkpoint.items()} + new_checkpoint = {k.replace('mid_new_2', 'mid_block'): v for k, v in new_checkpoint.items()} return new_checkpoint diff --git a/scripts/convert_ldm_original_checkpoint_to_diffusers.py b/scripts/convert_ldm_original_checkpoint_to_diffusers.py index 30dfa8310cf6..3116bb27547a 100644 --- a/scripts/convert_ldm_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ldm_original_checkpoint_to_diffusers.py @@ -207,14 +207,14 @@ 
def convert_ldm_checkpoint(checkpoint, config): attentions_paths = renew_attention_paths(attentions) to_split = { 'middle_block.1.qkv.bias': { - 'key': 'mid.attentions.0.key.bias', - 'query': 'mid.attentions.0.query.bias', - 'value': 'mid.attentions.0.value.bias', + 'key': 'mid_block.attentions.0.key.bias', + 'query': 'mid_block.attentions.0.query.bias', + 'value': 'mid_block.attentions.0.value.bias', }, 'middle_block.1.qkv.weight': { - 'key': 'mid.attentions.0.key.weight', - 'query': 'mid.attentions.0.query.weight', - 'value': 'mid.attentions.0.value.weight', + 'key': 'mid_block.attentions.0.key.weight', + 'query': 'mid_block.attentions.0.query.weight', + 'value': 'mid_block.attentions.0.value.weight', }, } assign_to_checkpoint(attentions_paths, new_checkpoint, checkpoint, attention_paths_to_split=to_split, config=config) @@ -239,13 +239,13 @@ def convert_ldm_checkpoint(checkpoint, config): resnet_0_paths = renew_resnet_paths(resnets) paths = renew_resnet_paths(resnets) - meta_path = {'old': f'output_blocks.{i}.0', 'new': f'upsample_blocks.{block_id}.resnets.{layer_in_block_id}'} + meta_path = {'old': f'output_blocks.{i}.0', 'new': f'up_blocks.{block_id}.resnets.{layer_in_block_id}'} assign_to_checkpoint(paths, new_checkpoint, checkpoint, additional_replacements=[meta_path], config=config) if ['conv.weight', 'conv.bias'] in output_block_list.values(): index = list(output_block_list.values()).index(['conv.weight', 'conv.bias']) - new_checkpoint[f'upsample_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'output_blocks.{i}.{index}.conv.weight'] - new_checkpoint[f'upsample_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'output_blocks.{i}.{index}.conv.bias'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.weight'] = checkpoint[f'output_blocks.{i}.{index}.conv.weight'] + new_checkpoint[f'up_blocks.{block_id}.upsamplers.0.conv.bias'] = checkpoint[f'output_blocks.{i}.{index}.conv.bias'] # Clear attentions as they have been attributed above. 
if len(attentions) == 2: @@ -255,18 +255,18 @@ def convert_ldm_checkpoint(checkpoint, config): paths = renew_attention_paths(attentions) meta_path = { 'old': f'output_blocks.{i}.1', - 'new': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}' + 'new': f'up_blocks.{block_id}.attentions.{layer_in_block_id}' } to_split = { f'output_blocks.{i}.1.qkv.bias': { - 'key': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', - 'query': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', - 'value': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.bias', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.bias', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.bias', }, f'output_blocks.{i}.1.qkv.weight': { - 'key': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', - 'query': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', - 'value': f'upsample_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', + 'key': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.key.weight', + 'query': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.query.weight', + 'value': f'up_blocks.{block_id}.attentions.{layer_in_block_id}.value.weight', }, } assign_to_checkpoint( @@ -281,7 +281,7 @@ def convert_ldm_checkpoint(checkpoint, config): resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1) for path in resnet_0_paths: old_path = '.'.join(['output_blocks', str(i), path['old']]) - new_path = '.'.join(['upsample_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) + new_path = '.'.join(['up_blocks', str(block_id), 'resnets', str(layer_in_block_id), path['new']]) new_checkpoint[new_path] = checkpoint[old_path] diff --git a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py index ae179d5f9cbf..8f02d6915486 100644 --- a/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py +++ b/scripts/convert_ncsnpp_original_checkpoint_to_diffusers.py @@ -25,8 +25,8 @@ def convert_ncsnpp_checkpoint(checkpoint, config): Takes a state dict and the path to """ new_model_architecture = UNet2DModel(**config) - new_model_architecture.time_steps.W.data = checkpoint["all_modules.0.W"].data - new_model_architecture.time_steps.weight.data = checkpoint["all_modules.0.W"].data + new_model_architecture.time_proj.W.data = checkpoint["all_modules.0.W"].data + new_model_architecture.time_proj.weight.data = checkpoint["all_modules.0.W"].data new_model_architecture.time_embedding.linear_1.weight.data = checkpoint["all_modules.1.weight"].data new_model_architecture.time_embedding.linear_1.bias.data = checkpoint["all_modules.1.bias"].data @@ -92,14 +92,14 @@ def set_resnet_weights(new_layer, old_checkpoint, index): block.skip_conv.bias.data = checkpoint[f"all_modules.{module_index}.Conv_0.bias"].data module_index += 1 - set_resnet_weights(new_model_architecture.mid.resnets[0], checkpoint, module_index) + set_resnet_weights(new_model_architecture.mid_block.resnets[0], checkpoint, module_index) module_index += 1 - set_attention_weights(new_model_architecture.mid.attentions[0], checkpoint, module_index) + set_attention_weights(new_model_architecture.mid_block.attentions[0], checkpoint, module_index) module_index += 1 - set_resnet_weights(new_model_architecture.mid.resnets[1], checkpoint, module_index) + 
set_resnet_weights(new_model_architecture.mid_block.resnets[1], checkpoint, module_index) module_index += 1 - for i, block in enumerate(new_model_architecture.upsample_blocks): + for i, block in enumerate(new_model_architecture.up_blocks): has_attentions = hasattr(block, "attentions") for j in range(len(block.resnets)): set_resnet_weights(block.resnets[j], checkpoint, module_index)
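
The patches above rename two different things at once: the constructor/config arguments of the UNet models (`image_size` → `sample_size`, `block_channels` → `block_out_channels`, `down_blocks` → `down_block_types`, and so on) and the leading module names inside saved state dicts (`time_steps` → `time_proj`, `mid` → `mid_block`, `downsample_blocks`/`upsample_blocks` → `down_blocks`/`up_blocks`). The sketch below is not part of the PR; it only illustrates the remapping pattern. The two mapping tables mirror the ones in `scripts/change_configs.py` from the diffs, while the helper functions and the `__main__` example are hypothetical and use plain dicts so the snippet runs without loading real checkpoints.

```python
# Illustrative sketch of the config-key and state-dict-key renaming done in the
# patches above. Mapping tables are copied from scripts/change_configs.py; the
# helpers themselves are hypothetical and operate on plain dicts.

OLD_TO_NEW_CONFIG_KEYS = {
    "image_size": "sample_size",
    "num_res_blocks": "layers_per_block",
    "block_channels": "block_out_channels",
    "down_blocks": "down_block_types",
    "up_blocks": "up_block_types",
    "downscale_freq_shift": "freq_shift",
    "resnet_num_groups": "norm_num_groups",
    "resnet_act_fn": "act_fn",
    "resnet_eps": "norm_eps",
    "num_head_channels": "attention_head_dim",
}

OLD_TO_NEW_MODULE_PREFIXES = {
    "time_steps": "time_proj",
    "mid": "mid_block",
    "downsample_blocks": "down_blocks",
    "upsample_blocks": "up_blocks",
}


def rename_config(old_config: dict) -> dict:
    """Rename top-level config keys and strip the legacy 'UNetRes' block prefix."""
    new_config = {OLD_TO_NEW_CONFIG_KEYS.get(k, k): v for k, v in old_config.items()}
    for key in ("down_block_types", "up_block_types"):
        if key in new_config:
            new_config[key] = [name.replace("UNetRes", "") for name in new_config[key]]
    return new_config


def rename_state_dict(old_state_dict: dict) -> dict:
    """Remap only the leading (first dotted segment) module name of each key."""
    new_state_dict = {}
    for key, value in old_state_dict.items():
        head, _, tail = key.partition(".")
        new_head = OLD_TO_NEW_MODULE_PREFIXES.get(head, head)
        new_state_dict[f"{new_head}.{tail}" if tail else new_head] = value
    return new_state_dict


if __name__ == "__main__":
    old_config = {
        "image_size": 32,
        "num_res_blocks": 2,
        "block_channels": (32, 64),
        "down_blocks": ("UNetResDownBlock2D", "UNetResAttnDownBlock2D"),
        "up_blocks": ("UNetResAttnUpBlock2D", "UNetResUpBlock2D"),
    }
    print(rename_config(old_config))
    # {'sample_size': 32, 'layers_per_block': 2, 'block_out_channels': (32, 64),
    #  'down_block_types': ['DownBlock2D', 'AttnDownBlock2D'],
    #  'up_block_types': ['AttnUpBlock2D', 'UpBlock2D']}

    old_weights = {"time_steps.weight": 0, "mid.resnets.0.conv1.weight": 0}
    print(rename_state_dict(old_weights))
    # {'time_proj.weight': 0, 'mid_block.resnets.0.conv1.weight': 0}
```

Matching the full first dotted segment of each parameter key, rather than a plain `startswith` check, mirrors the final version of the conversion script in PATCH 11/12 and avoids rewriting keys whose first segment merely shares a prefix with one of the old module names.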