yoonseokjin
diff --git a/‎__init__.py‎
Lines changed: 1 addition & 1 deletion b/‎__init__.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎configuration_utils.py‎
Lines changed: 4 additions & 4 deletions b/‎configuration_utils.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎modeling_utils.py‎
Lines changed: 42 additions & 43 deletions b/‎modeling_utils.py‎
Lines changed: 42 additions & 43 deletions
diff --git a/‎models/__init__.py‎
Lines changed: 2 additions & 2 deletions b/‎models/__init__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎models/attention.py‎
Lines changed: 1 addition & 8 deletions b/‎models/attention.py‎
Lines changed: 1 addition & 8 deletions
diff --git a/‎models/resnet.py‎
Lines changed: 1 addition & 2 deletions b/‎models/resnet.py‎
Lines changed: 1 addition & 2 deletions
diff --git a/‎models/unet_2d.py‎
Lines changed: 182 additions & 0 deletions b/‎models/unet_2d.py‎
Lines changed: 182 additions & 0 deletions
@@ -7,7 +7,7 @@
 __version__ = "0.0.4"
 
 from .modeling_utils import ModelMixin
-from .models import AutoencoderKL, UNetConditionalModel, UNetUnconditionalModel, VQModel
+from .models import AutoencoderKL, UNet2DConditionModel, UNet2DModel, VQModel
 from .pipeline_utils import DiffusionPipeline
 from .pipelines import DDIMPipeline, DDPMPipeline, LatentDiffusionUncondPipeline, PNDMPipeline, ScoreSdeVePipeline
 from .schedulers import DDIMScheduler, DDPMScheduler, PNDMScheduler, SchedulerMixin, ScoreSdeVeScheduler
 
@@ -161,10 +161,10 @@ def get_config_dict(
 
             except RepositoryNotFoundError:
                 raise EnvironmentError(
-                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier listed"
-                    " on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a token"
-                    " having permission to this repo with `use_auth_token` or log in with `huggingface-cli login` and"
-                    " pass `use_auth_token=True`."
+                    f"{pretrained_model_name_or_path} is not a local folder and is not a valid model identifier"
+                    " listed on 'https://huggingface.co/models'\nIf this is a private repository, make sure to pass a"
+                    " token having permission to this repo with `use_auth_token` or log in with `huggingface-cli"
+                    " login` and pass `use_auth_token=True`."
                 )
             except RevisionNotFoundError:
                 raise EnvironmentError(
 
@@ -34,7 +34,7 @@
 )
 
 
-WEIGHTS_NAME = "diffusion_model.pt"
+WEIGHTS_NAME = "diffusion_pytorch_model.bin"
 
 
 logger = logging.get_logger(__name__)
@@ -147,7 +147,7 @@ class ModelMixin(torch.nn.Module):
           models, `pixel_values` for vision models and `input_values` for speech models).
     """
     config_name = CONFIG_NAME
-    _automatically_saved_args = ["_diffusers_version", "_class_name", "name_or_path"]
+    _automatically_saved_args = ["_diffusers_version", "_class_name", "_name_or_path"]
 
     def __init__(self):
         super().__init__()
@@ -341,7 +341,7 @@ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.P
             subfolder=subfolder,
             **kwargs,
         )
-        model.register_to_config(name_or_path=pretrained_model_name_or_path)
+        model.register_to_config(_name_or_path=pretrained_model_name_or_path)
         # This variable will flag if we're loading a sharded checkpoint. In this case the archive file is just the
         # Load model
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
@@ -497,46 +497,45 @@ def _find_mismatched_keys(
                 )
             raise RuntimeError(f"Error(s) in loading state_dict for {model.__class__.__name__}:\n\t{error_msg}")
 
-        if False:
-            if len(unexpected_keys) > 0:
-                logger.warning(
-                    f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
-                    f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
-                    f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
-                    " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
-                    " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
-                    f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
-                    " identical (initializing a BertForSequenceClassification model from a"
-                    " BertForSequenceClassification model)."
-                )
-            else:
-                logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
-            if len(missing_keys) > 0:
-                logger.warning(
-                    f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
-                    f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
-                    " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
-                )
-            elif len(mismatched_keys) == 0:
-                logger.info(
-                    f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
-                    f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
-                    f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
-                    " without further training."
-                )
-            if len(mismatched_keys) > 0:
-                mismatched_warning = "\n".join(
-                    [
-                        f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
-                        for key, shape1, shape2 in mismatched_keys
-                    ]
-                )
-                logger.warning(
-                    f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
-                    f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
-                    f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
-                    " able to use it for predictions and inference."
-                )
+        if len(unexpected_keys) > 0:
+            logger.warning(
+                f"Some weights of the model checkpoint at {pretrained_model_name_or_path} were not used when"
+                f" initializing {model.__class__.__name__}: {unexpected_keys}\n- This IS expected if you are"
+                f" initializing {model.__class__.__name__} from the checkpoint of a model trained on another task"
+                " or with another architecture (e.g. initializing a BertForSequenceClassification model from a"
+                " BertForPreTraining model).\n- This IS NOT expected if you are initializing"
+                f" {model.__class__.__name__} from the checkpoint of a model that you expect to be exactly"
+                " identical (initializing a BertForSequenceClassification model from a"
+                " BertForSequenceClassification model)."
+            )
+        else:
+            logger.info(f"All model checkpoint weights were used when initializing {model.__class__.__name__}.\n")
+        if len(missing_keys) > 0:
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized: {missing_keys}\nYou should probably"
+                " TRAIN this model on a down-stream task to be able to use it for predictions and inference."
+            )
+        elif len(mismatched_keys) == 0:
+            logger.info(
+                f"All the weights of {model.__class__.__name__} were initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path}.\nIf your task is similar to the task the model of the"
+                f" checkpoint was trained on, you can already use {model.__class__.__name__} for predictions"
+                " without further training."
+            )
+        if len(mismatched_keys) > 0:
+            mismatched_warning = "\n".join(
+                [
+                    f"- {key}: found shape {shape1} in the checkpoint and {shape2} in the model instantiated"
+                    for key, shape1, shape2 in mismatched_keys
+                ]
+            )
+            logger.warning(
+                f"Some weights of {model.__class__.__name__} were not initialized from the model checkpoint at"
+                f" {pretrained_model_name_or_path} and are newly initialized because the shapes did not"
+                f" match:\n{mismatched_warning}\nYou should probably TRAIN this model on a down-stream task to be"
+                " able to use it for predictions and inference."
+            )
 
         return model, missing_keys, unexpected_keys, mismatched_keys, error_msgs
 
 
@@ -16,6 +16,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .unet_conditional import UNetConditionalModel
-from .unet_unconditional import UNetUnconditionalModel
+from .unet_2d import UNet2DModel
+from .unet_2d_condition import UNet2DConditionModel
 from .vae import AutoencoderKL, VQModel
@@ -17,22 +17,15 @@ class AttentionBlockNew(nn.Module):
     def __init__(
         self,
         channels,
-        num_heads=1,
         num_head_channels=None,
         num_groups=32,
         rescale_output_factor=1.0,
         eps=1e-5,
     ):
         super().__init__()
         self.channels = channels
-        if num_head_channels is None:
-            self.num_heads = num_heads
-        else:
-            assert (
-                channels % num_head_channels == 0
-            ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}"
-            self.num_heads = channels // num_head_channels
 
+        self.num_heads = channels // num_head_channels if num_head_channels is not None else 1
         self.num_head_size = num_head_channels
         self.group_norm = nn.GroupNorm(num_channels=channels, num_groups=num_groups, eps=eps, affine=True)
 
 
@@ -78,12 +78,11 @@ def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name=
 
         # TODO(Suraj, Patrick) - clean up after weight dicts are correctly renamed
         if name == "conv":
+            self.Conv2d_0 = conv
             self.conv = conv
         elif name == "Conv2d_0":
-            self.Conv2d_0 = conv
             self.conv = conv
         else:
-            self.op = conv
             self.conv = conv
 
     def forward(self, x):
 
@@ -0,0 +1,228 @@
+from typing import Dict, Union
+
+import torch
+import torch.nn as nn
+
+from ..configuration_utils import ConfigMixin, register_to_config
+from ..modeling_utils import ModelMixin
+from .embeddings import GaussianFourierProjection, TimestepEmbedding, Timesteps
+from .unet_blocks import UNetMidBlock2D, get_down_block, get_up_block
+
+
+class UNet2DModel(ModelMixin, ConfigMixin):
+    """
+    Timestep-conditioned 2D UNet: input convolution, down blocks, a mid block, up blocks and an output head.
+
+    Args:
+        sample_size: Stored as `self.sample_size`; not otherwise used by this class.
+        in_channels: Number of input channels of `conv_in`.
+        out_channels: Number of output channels of `conv_out`.
+        center_input_sample: If true, `forward` first maps the input to `2 * sample - 1.0`.
+        time_embedding_type: `"positional"` (uses `Timesteps`) or `"fourier"` (uses
+            `GaussianFourierProjection`); any other value raises `ValueError`.
+        freq_shift: Forwarded to `Timesteps` (positional embedding only).
+        flip_sin_to_cos: Forwarded to `Timesteps` (positional embedding only).
+        down_block_types: Block type names handed to `get_down_block`, one down block per entry.
+        up_block_types: Block type names handed to `get_up_block`, one up block per entry.
+        block_out_channels: Output channels per down block; read in reverse order for the up blocks.
+        layers_per_block: `num_layers` of every down block; every up block gets one more.
+        mid_block_scale_factor: Forwarded as `output_scale_factor` to the mid block.
+        downsample_padding: Forwarded to every down block.
+        act_fn: Forwarded as `resnet_act_fn` to all blocks; the output head always uses `nn.SiLU`.
+        attention_head_dim: Forwarded as `attn_num_head_channels` to all blocks.
+        norm_num_groups: Forwarded as `resnet_groups` to the mid block and used as the group count of the
+            output `GroupNorm`; if `None`, the output norm uses `min(block_out_channels[0] // 4, 32)` groups.
+        norm_eps: Forwarded as `resnet_eps` to all blocks and used as `eps` of the output `GroupNorm`.
+    """
+
+    @register_to_config
+    def __init__(
+        self,
+        sample_size=None,
+        in_channels=3,
+        out_channels=3,
+        center_input_sample=False,
+        time_embedding_type="positional",
+        freq_shift=0,
+        flip_sin_to_cos=True,
+        down_block_types=("DownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D", "AttnDownBlock2D"),
+        up_block_types=("AttnUpBlock2D", "AttnUpBlock2D", "AttnUpBlock2D", "UpBlock2D"),
+        block_out_channels=(224, 448, 672, 896),
+        layers_per_block=2,
+        mid_block_scale_factor=1,
+        downsample_padding=1,
+        act_fn="silu",
+        attention_head_dim=8,
+        norm_num_groups=32,
+        norm_eps=1e-5,
+    ):
+        super().__init__()
+
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+
+        # input
+        self.conv_in = nn.Conv2d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+
+        # time
+        if time_embedding_type == "fourier":
+            self.time_proj = GaussianFourierProjection(embedding_size=block_out_channels[0], scale=16)
+            timestep_input_dim = 2 * block_out_channels[0]
+        elif time_embedding_type == "positional":
+            self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+            timestep_input_dim = block_out_channels[0]
+        else:
+            # Without this branch an unknown type only surfaced as an unbound `timestep_input_dim` below.
+            raise ValueError(
+                f"Unsupported time_embedding_type {time_embedding_type!r}; expected 'fourier' or 'positional'."
+            )
+
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=attention_head_dim,
+                downsample_padding=downsample_padding,
+            )
+            self.down_blocks.append(down_block)
+
+        # mid
+        self.mid_block = UNetMidBlock2D(
+            in_channels=block_out_channels[-1],
+            temb_channels=time_embed_dim,
+            resnet_eps=norm_eps,
+            resnet_act_fn=act_fn,
+            output_scale_factor=mid_block_scale_factor,
+            resnet_time_scale_shift="default",
+            attn_num_head_channels=attention_head_dim,
+            resnet_groups=norm_num_groups,
+        )
+
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+
+            is_final_block = i == len(block_out_channels) - 1
+
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                attn_num_head_channels=attention_head_dim,
+            )
+            self.up_blocks.append(up_block)
+
+        # out
+        num_groups_out = norm_num_groups if norm_num_groups is not None else min(block_out_channels[0] // 4, 32)
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=num_groups_out, eps=norm_eps)
+        self.conv_act = nn.SiLU()
+        self.conv_out = nn.Conv2d(block_out_channels[0], out_channels, 3, padding=1)
+
+    def forward(
+        self, sample: torch.FloatTensor, timestep: Union[torch.Tensor, float, int]
+    ) -> Dict[str, torch.FloatTensor]:
+        """
+        Run the UNet on `sample` conditioned on `timestep`.
+
+        Args:
+            sample: Input tensor; it is passed through `conv_in` and the down/mid/up blocks.
+            timestep: A tensor, or a Python number that is wrapped into a one-element tensor on
+                `sample.device` (floats stay floating point, everything else becomes `torch.long`).
+
+        Returns:
+            A dict whose only key, `"sample"`, holds the output tensor.
+        """
+        # 0. center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+
+        # 1. time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # A float cast to long is truncated, which can even turn the fourier rescaling at the end of
+            # this method into a division by zero; only non-float values are stored as long.
+            dtype = torch.float32 if isinstance(timesteps, float) else torch.long
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+
+        t_emb = self.time_proj(timesteps)
+        emb = self.time_embedding(t_emb)
+
+        # 2. pre-process
+        skip_sample = sample
+        sample = self.conv_in(sample)
+
+        # 3. down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            # blocks that expose `skip_conv` additionally take and return the running skip sample
+            if hasattr(downsample_block, "skip_conv"):
+                sample, res_samples, skip_sample = downsample_block(
+                    hidden_states=sample, temb=emb, skip_sample=skip_sample
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+
+            down_block_res_samples += res_samples
+
+        # 4. mid
+        sample = self.mid_block(sample, emb)
+
+        # 5. up
+        skip_sample = None
+        for upsample_block in self.up_blocks:
+            # pop one residual per resnet of this block from the end of the collected down-path outputs
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+
+            if hasattr(upsample_block, "skip_conv"):
+                sample, skip_sample = upsample_block(sample, res_samples, emb, skip_sample)
+            else:
+                sample = upsample_block(sample, res_samples, emb)
+
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if skip_sample is not None:
+            sample += skip_sample
+
+        if self.config.time_embedding_type == "fourier":
+            # `-1` keeps one divisor per timestep entry, so a single-entry timestep broadcasts over the
+            # batch instead of failing to reshape to `sample.shape[0]` entries.
+            timesteps = timesteps.reshape((-1, *([1] * len(sample.shape[1:]))))
+            sample = sample / timesteps
+
+        output = {"sample": sample}
+
+        return output