Describe the bug
The example at https://huggingface.co/stabilityai/stable-diffusion-2-1#examples does not run as written, but it does work if you omit torch_dtype=torch.float16.
Reproduction
import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

model_id = "stabilityai/stable-diffusion-2-1"

# Use the DPMSolverMultistepScheduler (DPM-Solver++) scheduler here instead
# pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)  # FAILS
pipe = StableDiffusionPipeline.from_pretrained(model_id)  # WORKS
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
pipe = pipe.to("cuda")

prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]
image
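For what it's worth, the traceback below goes through torch.utils.checkpoint.checkpoint inside the CLIP text encoder, so my guess (not a confirmed fix) is that the fp16 path trips over gradient checkpointing being left enabled on the text encoder. A minimal sketch of that workaround, assuming the loaded text encoder exposes the standard transformers gradient_checkpointing_disable() method:

import torch
from diffusers import StableDiffusionPipeline, DPMSolverMultistepScheduler

model_id = "stabilityai/stable-diffusion-2-1"

# Sketch of a possible workaround (assumption: the Half/Float mismatch comes from
# the gradient-checkpointed path in the CLIP text encoder seen in the traceback).
pipe = StableDiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)

# Gradient checkpointing is only needed for training; disable it for inference.
pipe.text_encoder.gradient_checkpointing_disable()

pipe = pipe.to("cuda")
prompt = "a photo of an astronaut riding a horse on mars"
image = pipe(prompt).images[0]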
Logs
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
Input In [10], in <cell line: 13>()
10 pipe = pipe.to("cuda")
12 prompt = "a photo of an astronaut riding a horse on mars"
---> 13 image = pipe(prompt).images[0]
14 image
16 image.save("astronaut_rides_horse.png")
File /usr/local/lib/python3.9/dist-packages/torch/autograd/grad_mode.py:27, in _DecoratorContextManager.__call__.<locals>.decorate_context(*args, **kwargs)
24 @functools.wraps(func)
25 def decorate_context(*args, **kwargs):
26 with self.clone():
---> 27 return func(*args, **kwargs)
File /usr/local/lib/python3.9/dist-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:484, in __call__(self, prompt, height, width, num_inference_steps, guidance_scale, negative_prompt, num_images_per_prompt, eta, generator, latents, output_type, return_dict, callback, callback_steps)
481 height = height or self.unet.config.sample_size * self.vae_scale_factor
482 width = width or self.unet.config.sample_size * self.vae_scale_factor
--> 484 # 1. Check inputs. Raise error if not correct
485 self.check_inputs(prompt, height, width, callback_steps)
487 # 2. Define call parameters
File /usr/local/lib/python3.9/dist-packages/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py:265, in StableDiffusionPipeline._encode_prompt(self, prompt, device, num_images_per_prompt, do_classifier_free_guidance, negative_prompt)
262 else:
263 attention_mask = None
--> 265 text_embeddings = self.text_encoder(
266 text_input_ids.to(device),
267 attention_mask=attention_mask,
268 )
269 text_embeddings = text_embeddings[0]
271 # duplicate text embeddings for each generation per prompt, using mps friendly method
File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:726, in forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
717 if attention_mask is not None:
718 # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
719 attention_mask = _expand_mask(attention_mask, hidden_states.dtype)
721 encoder_outputs = self.encoder(
722 inputs_embeds=hidden_states,
723 attention_mask=attention_mask,
724 causal_attention_mask=causal_attention_mask,
725 output_attentions=output_attentions,
--> 726 output_hidden_states=output_hidden_states,
727 return_dict=return_dict,
728 )
730 last_hidden_state = encoder_outputs[0]
731 last_hidden_state = self.final_layer_norm(last_hidden_state)
File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:647, in forward(self, input_ids, attention_mask, position_ids, output_attentions, output_hidden_states, return_dict)
639 return module(*inputs, output_attentions)
641 return custom_forward
643 layer_outputs = torch.utils.checkpoint.checkpoint(
644 create_custom_forward(encoder_layer),
645 hidden_states,
646 attention_mask,
--> 647 causal_attention_mask,
648 )
649 else:
650 layer_outputs = encoder_layer(
651 hidden_states,
652 attention_mask,
653 causal_attention_mask,
654 output_attentions=output_attentions,
655 )
File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:578, in forward(self, inputs_embeds, attention_mask, causal_attention_mask, output_attentions, output_hidden_states, return_dict)
569 class CLIPEncoder(nn.Module):
570 """
571 Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
572 [`CLIPEncoderLayer`].
(...)
575 config: CLIPConfig
576 """
--> 578 def __init__(self, config: CLIPConfig):
579 super().__init__()
580 self.config = config
File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:321, in forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
318 attn_output = torch.bmm(attn_probs, value_states)
320 if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
--> 321 raise ValueError(
322 f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
323 f" {attn_output.size()}"
324 )
326 attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
327 attn_output = attn_output.transpose(1, 2)
File /usr/local/lib/python3.9/dist-packages/torch/nn/modules/module.py:1130, in Module._call_impl(self, *input, **kwargs)
1126 # If we don't have any hooks, we want to skip the rest of the logic in
1127 # this function, and just call forward.
1128 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1129 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1130 return forward_call(*input, **kwargs)
1131 # Do not call functions when jit is used
1132 full_backward_hooks, non_full_backward_hooks = [], []
File /usr/local/lib/python3.9/dist-packages/transformers/models/clip/modeling_clip.py:260, in forward(self, hidden_states, attention_mask, causal_attention_mask, output_attentions)
253 def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
254 return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
256 def forward(
257 self,
258 hidden_states: torch.Tensor,
259 attention_mask: Optional[torch.Tensor] = None,
--> 260 causal_attention_mask: Optional[torch.Tensor] = None,
261 output_attentions: Optional[bool] = False,
262 ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
263 """Input shape: Batch x Time x Channel"""
265 bsz, tgt_len, embed_dim = hidden_states.size()
RuntimeError: expected scalar type Half but found Float
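The RuntimeError indicates a float32 tensor reaching half-precision attention weights inside the text encoder. A quick sketch (assuming the fp16 pipeline loads without error) to check which component ends up in which dtype:

# Quick dtype check (sketch): with torch_dtype=torch.float16 every component should
# report torch.float16; a torch.float32 entry would explain the Half/Float error.
for name, module in [("text_encoder", pipe.text_encoder),
                     ("unet", pipe.unet),
                     ("vae", pipe.vae)]:
    print(name, next(module.parameters()).dtype)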
System Info
- `diffusers` version: 0.11.1
- Platform: Linux-5.4.0-122-generic-x86_64-with-glibc2.31
- Python version: 3.9.13
- PyTorch version (GPU?): 1.12.0+cu116 (True)
- Huggingface_hub version: 0.11.1
- Transformers version: 4.25.1
- Using GPU in script?: P6000
- Using distributed or parallel set-up in script?: no