stable diffusion 2 float16 mode not working: expected scalar type Half but found Float #1931

@murphyk

Description

Describe the bug

The example at https://huggingface.co/stabilityai/stable-diffusion-2-1#examples does not run as stated, but it does work if you omit torch_dtype=torch.float16.

Reproduction

import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

model_id = "stabilityai/stable-diffusion-2-1"

# Use the DPMSolverMultistepScheduler (DPM-Solver++) scheduler here instead
#pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16) # FAILS
pipe = StableDiffusionPipeline.from_pretrained(model_id) # WORKS
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]
image
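
For what it's worth, the error points at a dtype mismatch inside the text encoder's forward pass. Here is a small diagnostic sketch (not part of the original example, just reusing the same model id) that prints the parameter dtype of each pipeline component without running a forward pass, to help narrow down which module ends up in which precision:

import torch
from diffusers import StableDiffusionPipeline

# Diagnostic only: load with torch_dtype=torch.float16 and print each
# component's parameter dtype, without running inference.
pipe = StableDiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-2-1", torch_dtype=torch.float16
)

for name in ("text_encoder", "unet", "vae"):
    module = getattr(pipe, name)
    # next(module.parameters()).dtype works for any torch.nn.Module
    print(name, next(module.parameters()).dtype)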

Logs

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
Input In [10], in <cell line: 13>()
     10 pipe = pipe.to("cuda")
     12 prompt = "a photo of an astronaut riding a horse on mars"
---> 13 image = pipe(prompt).images[0]
     14 image
     16 image.save("astronaut_rides_horse.png")

File /usr/local/lib/python3.9/dist-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
     24 @functools.wraps(func)
     25 def decorate_context(*args, **kwargs):
     26     with self.clone():
---> 27         return func(*args, **kwargs)

File /usr/local/lib/python3.9/dist-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:484, in __call__(self, prompt, height, width, num_inference_steps, guidance_scale, negative_prompt, num_images_per_prompt, eta, generator, latents, output_type, return_dict, callback, callback_steps)
    481 height = height or self.unet.config.sample_size * self.vae_scale_factor
    482 width = width or self.unet.config.sample_size * self.vae_scale_factor
--> 484 # 1. Check inputs. Raise error if not correct
    485 self.check_inputs(prompt, height, width, callback_steps)
    487 # 2. Define call parameters

File /usr/local/lib/python3.9/dist-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:265, in StableDiffusionPipeline._encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt)
    262 else:
    263     attention_mask = None
--> 265 text_embeddings = self.text_encoder(
    266     text_input_ids.to(device),
    267     attention_mask=attention_mask,
    268 )
    269 text_embeddings = text_embeddings[0]
    271 # duplicate text embeddings for each generation per prompt, using mps friendly method

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:726, in forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
    717 if attention_mask is not None:
    718     # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
    719     attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
    721 encoder_outputs = self.encoder(
    722     inputs_embeds=hidden_states,
    723     attention_mask=attention_mask,
    724     causal_attention_mask=causal_attention_mask,
    725     output_attentions=output_attentions,
--> 726     output_hidden_states=output_hidden_states,
    727     return_dict=return_dict,
    728 )
    730 last_hidden_state = encoder_outputs[0]
    731 last_hidden_state = self.final_layer_norm(last_hidden_state)

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:647, in forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
    639             return module(*inputs, output_attentions)
    641         return custom_forward
    643     layer_outputs = torch.utils.checkpoint.checkpoint(
    644         create_custom_forward(encoder_layer),
    645         hidden_states,
    646         attention_mask,
--> 647         causal_attention_mask,
    648     )
    649 else:
    650     layer_outputs = encoder_layer(
    651         hidden_states,
    652         attention_mask,
    653         causal_attention_mask,
    654         output_attentions=output_attentions,
    655     )

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:578, in forward(self, inputs_embeds, attention_mask, causal_attention_mask, output_attentions, output_hidden_states, return_dict)
    569 class CLIPEncoder(nn.Module):
    570     """
    571     Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    572     [`CLIPEncoderLayer`].
   (...)
    575         config: CLIPConfig
    576     """
--> 578     def __init__(self, config: CLIPConfig):
    579         super().__init__()
    580         self.config = config

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:321, in forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
    318 attn_output = torch.bmm(attn_probs, value_states)
    320 if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
--> 321     raise ValueError(
    322         f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
    323         f" {attn_output.size()}"
    324     )
    326 attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
    327 attn_output = attn_output.transpose(1, 2)

File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
   1126 # If we don't have any hooks, we want to skip the rest of the logic in
   1127 # this function, and just call forward.
   1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
   1129         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130     return forward_call(*input, **kwargs)
   1131 # Do not call functions when jit is used
   1132 full_backward_hooks, non_full_backward_hooks = [], []

File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:260, in forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
    253 def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
    254     return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
    256 def forward(
    257     self,
    258     hidden_states: torch.Tensor,
    259     attention_mask: Optional[torch.Tensor] = None,
--> 260     causal_attention_mask: Optional[torch.Tensor] = None,
    261     output_attentions: Optional[bool] = False,
    262 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    263     """Input shape: Batch x Time x Channel"""
    265     bsz, tgt_len, embed_dim = hidden_states.size()

RuntimeError: expected scalar type Half but found Float


System Info

- `diffusers` version: 0.11.1
- Platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31
- Python version: 3.9.13
- PyTorch version (GPU?): 1.12.0+cu116 (True)
- Huggingface_hub version: 0.11.1
- Transformers version: 4.25.1
- Using GPU in script?: P6000
- Using distributed or parallel set-up in script?: no
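
Workaround

Until fp16 loading works here, one common alternative (a hedged sketch, not a confirmed fix for this issue) is to keep the weights in fp32 and run inference under torch.autocast, so most of the forward pass executes in half precision on the GPU while the stored parameters stay float32:

import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

model_id = "stabilityai/stable-diffusion-2-1"

# Load in full precision (the configuration that works above).
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"

# Run the forward pass under autocast: eligible ops run in float16,
# while the parameters themselves remain float32.
with torch.autocast("cuda"):
    image = pipe(prompt).images[0]

image.save("astronaut_rides_horse.png")

This does not halve weight memory the way torch_dtype=torch.float16 does, but it avoids the Half/Float mismatch while the underlying issue is open.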
