Describe the bug
Hello, I tried to run Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4 using the official code snippet and hit the following error:
Traceback (most recent call last):
File "/home/foo/PPML/moe-test/qwen_q.py", line 21, in <module>
generated_ids = model.generate(
File "/home/foo/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
return func(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/generation/utils.py", line 2250, in generate
result = self._sample(
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/generation/utils.py", line 3238, in _sample
outputs = self(**model_inputs, return_dict=True)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
return func(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 1316, in forward
outputs = self.model(
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 1017, in forward
layer_outputs = decoder_layer(
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 745, in forward
hidden_states = self.mlp(hidden_states)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 654, in forward
current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 280, in forward
return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
return forward_call(*args, **kwargs)
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/gptqmodel/nn_modules/qlinear/marlin.py", line 375, in forward
return apply_gptq_marlin_linear(
File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/gptqmodel/nn_modules/qlinear/marlin.py", line 139, in apply_gptq_marlin_linear
output = gptqmodel_marlin_kernels.gptq_marlin_gemm(reshaped_x,
RuntimeError: Invalid MNK = [0, 1408, 2048]
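My own reading of the traceback, offered as an assumption rather than a confirmed diagnosis: the failing GEMM has M = 0, i.e. the Marlin kernel is invoked for an expert that received no tokens in this batch (with num_experts = 60, num_experts_per_tok = 4, and a short prompt, most experts get nothing). The sketch below, in plain PyTorch with hypothetical names, mimics how the qwen2_moe expert loop can hand an empty (0, hidden_size) tensor to an expert's gate_proj/up_proj; a dense nn.Linear tolerates that, but a quantized kernel that rejects empty inputs would raise exactly this error.

import torch

hidden_size, moe_intermediate_size = 2048, 1408
num_experts, top_k = 60, 4

hidden_states = torch.randn(9, hidden_size)        # 9 prompt tokens
router_logits = torch.randn(9, num_experts)
_, selected_experts = torch.topk(router_logits, top_k, dim=-1)   # (9, 4)

# one-hot mask of which expert each (token, slot) pair picked: (9, 4, 60)
expert_mask = torch.nn.functional.one_hot(selected_experts, num_experts)

gate_proj = torch.nn.Linear(hidden_size, moe_intermediate_size, bias=False)

for expert_idx in range(num_experts):
    token_idx, _ = torch.where(expert_mask[..., expert_idx])
    current_state = hidden_states[token_idx]   # (0, 2048) for any unused expert
    out = gate_proj(current_state)             # dense Linear accepts M == 0,
    # but a quantized GEMM that asserts M > 0 fails with
    # RuntimeError: Invalid MNK = [0, 1408, 2048]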
GPU Info
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06 Driver Version: 555.42.06 CUDA Version: 12.5 |
|-----------------------------------------+------------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+========================+======================|
| 0 NVIDIA A800 80GB PCIe On | 00000000:40:00.0 Off | 0 |
| N/A 26C P0 41W / 300W | 1MiB / 81920MiB | 0% Default |
| | | Disabled |
+-----------------------------------------+------------------------+----------------------+
+-----------------------------------------------------------------------------------------+
| Processes: |
| GPU GI CI PID Type Process name GPU Memory |
| ID ID Usage |
|=========================================================================================|
| No running processes found |
+-----------------------------------------------------------------------------------------+
Software Info
No LSB modules are available.
Distributor ID: Ubuntu
Description: Ubuntu 22.04.4 LTS
Release: 22.04
Codename: jammy
Name: gptqmodel
Version: 1.9.0
Summary: A LLM quantization package with user-friendly apis. Based on GPTQ algorithm.
Home-page: https://github.com/ModelCloud/GPTQModel
Author: ModelCloud
Author-email: [email protected]
License: Apache 2.0
Location: /home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages
Requires: accelerate, datasets, device-smi, hf_transfer, huggingface_hub, numpy, packaging, pillow, protobuf, safetensors, threadpoolctl, tokenicer, torch, transformers
Required-by:
---
Name: torch
Version: 2.6.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: [email protected]
License: BSD-3-Clause
Location: /home/foo/.local/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-cusparselt-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, gptqmodel, optimum, peft, torchaudio, torchvision
---
Name: transformers
Version: 4.50.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: [email protected]
License: Apache 2.0 License
Location: /home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: gptqmodel, optimum, peft, tokenicer
---
Name: accelerate
Version: 1.4.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: [email protected]
License: Apache
Location: /home/foo/.local/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: gptqmodel, peft
---
Name: triton
Version: 3.2.0
Summary: A language and compiler for custom Deep Learning operations
Home-page: https://github.com/triton-lang/triton/
Author: Philippe Tillet
Author-email: [email protected]
License:
Location: /home/foo/.local/lib/python3.10/site-packages
Requires:
Required-by: torch
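For reference, the model's config.json follows. Note that hidden_size (2048) and moe_intermediate_size (1408) match the K and N of the failing GEMM above.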
{
"architectures": [
"Qwen2MoeForCausalLM"
],
"attention_dropout": 0.0,
"bos_token_id": 151643,
"eos_token_id": 151645,
"decoder_sparse_step": 1,
"hidden_act": "silu",
"hidden_size": 2048,
"initializer_range": 0.02,
"intermediate_size": 5632,
"max_position_embeddings": 32768,
"max_window_layers": 21,
"model_type": "qwen2_moe",
"moe_intermediate_size": 1408,
"norm_topk_prob": false,
"num_attention_heads": 16,
"num_experts": 60,
"num_experts_per_tok": 4,
"num_hidden_layers": 24,
"num_key_value_heads": 16,
"output_router_logits": false,
"quantization_config": {
"batch_size": 1,
"bits": 4,
"block_name_to_quantize": null,
"cache_block_outputs": true,
"damp_percent": 0.01,
"dataset": null,
"desc_act": false,
"exllama_config": {
"version": 1
},
"group_size": 128,
"max_input_length": null,
"model_seqlen": null,
"module_name_preceding_first_block": null,
"modules_in_block_to_quantize": [
[
"self_attn.k_proj",
"self_attn.v_proj",
"self_attn.q_proj"
],
[
"self_attn.o_proj"
],
[
"mlp.shared_expert.up_proj",
"mlp.shared_expert.gate_proj"
],
[
"mlp.shared_expert.down_proj"
],
[
"mlp.experts.0.up_proj",
"mlp.experts.1.up_proj",
"mlp.experts.2.up_proj",
"mlp.experts.3.up_proj",
"mlp.experts.4.up_proj",
"mlp.experts.5.up_proj",
"mlp.experts.6.up_proj",
"mlp.experts.7.up_proj",
"mlp.experts.8.up_proj",
"mlp.experts.9.up_proj",
"mlp.experts.10.up_proj",
"mlp.experts.11.up_proj",
"mlp.experts.12.up_proj",
"mlp.experts.13.up_proj",
"mlp.experts.14.up_proj",
"mlp.experts.15.up_proj",
"mlp.experts.16.up_proj",
"mlp.experts.17.up_proj",
"mlp.experts.18.up_proj",
"mlp.experts.19.up_proj",
"mlp.experts.20.up_proj",
"mlp.experts.21.up_proj",
"mlp.experts.22.up_proj",
"mlp.experts.23.up_proj",
"mlp.experts.24.up_proj",
"mlp.experts.25.up_proj",
"mlp.experts.26.up_proj",
"mlp.experts.27.up_proj",
"mlp.experts.28.up_proj",
"mlp.experts.29.up_proj",
"mlp.experts.30.up_proj",
"mlp.experts.31.up_proj",
"mlp.experts.32.up_proj",
"mlp.experts.33.up_proj",
"mlp.experts.34.up_proj",
"mlp.experts.35.up_proj",
"mlp.experts.36.up_proj",
"mlp.experts.37.up_proj",
"mlp.experts.38.up_proj",
"mlp.experts.39.up_proj",
"mlp.experts.40.up_proj",
"mlp.experts.41.up_proj",
"mlp.experts.42.up_proj",
"mlp.experts.43.up_proj",
"mlp.experts.44.up_proj",
"mlp.experts.45.up_proj",
"mlp.experts.46.up_proj",
"mlp.experts.47.up_proj",
"mlp.experts.48.up_proj",
"mlp.experts.49.up_proj",
"mlp.experts.50.up_proj",
"mlp.experts.51.up_proj",
"mlp.experts.52.up_proj",
"mlp.experts.53.up_proj",
"mlp.experts.54.up_proj",
"mlp.experts.55.up_proj",
"mlp.experts.56.up_proj",
"mlp.experts.57.up_proj",
"mlp.experts.58.up_proj",
"mlp.experts.59.up_proj",
"mlp.experts.0.gate_proj",
"mlp.experts.1.gate_proj",
"mlp.experts.2.gate_proj",
"mlp.experts.3.gate_proj",
"mlp.experts.4.gate_proj",
"mlp.experts.5.gate_proj",
"mlp.experts.6.gate_proj",
"mlp.experts.7.gate_proj",
"mlp.experts.8.gate_proj",
"mlp.experts.9.gate_proj",
"mlp.experts.10.gate_proj",
"mlp.experts.11.gate_proj",
"mlp.experts.12.gate_proj",
"mlp.experts.13.gate_proj",
"mlp.experts.14.gate_proj",
"mlp.experts.15.gate_proj",
"mlp.experts.16.gate_proj",
"mlp.experts.17.gate_proj",
"mlp.experts.18.gate_proj",
"mlp.experts.19.gate_proj",
"mlp.experts.20.gate_proj",
"mlp.experts.21.gate_proj",
"mlp.experts.22.gate_proj",
"mlp.experts.23.gate_proj",
"mlp.experts.24.gate_proj",
"mlp.experts.25.gate_proj",
"mlp.experts.26.gate_proj",
"mlp.experts.27.gate_proj",
"mlp.experts.28.gate_proj",
"mlp.experts.29.gate_proj",
"mlp.experts.30.gate_proj",
"mlp.experts.31.gate_proj",
"mlp.experts.32.gate_proj",
"mlp.experts.33.gate_proj",
"mlp.experts.34.gate_proj",
"mlp.experts.35.gate_proj",
"mlp.experts.36.gate_proj",
"mlp.experts.37.gate_proj",
"mlp.experts.38.gate_proj",
"mlp.experts.39.gate_proj",
"mlp.experts.40.gate_proj",
"mlp.experts.41.gate_proj",
"mlp.experts.42.gate_proj",
"mlp.experts.43.gate_proj",
"mlp.experts.44.gate_proj",
"mlp.experts.45.gate_proj",
"mlp.experts.46.gate_proj",
"mlp.experts.47.gate_proj",
"mlp.experts.48.gate_proj",
"mlp.experts.49.gate_proj",
"mlp.experts.50.gate_proj",
"mlp.experts.51.gate_proj",
"mlp.experts.52.gate_proj",
"mlp.experts.53.gate_proj",
"mlp.experts.54.gate_proj",
"mlp.experts.55.gate_proj",
"mlp.experts.56.gate_proj",
"mlp.experts.57.gate_proj",
"mlp.experts.58.gate_proj",
"mlp.experts.59.gate_proj"
],
[
"mlp.experts.0.down_proj",
"mlp.experts.1.down_proj",
"mlp.experts.2.down_proj",
"mlp.experts.3.down_proj",
"mlp.experts.4.down_proj",
"mlp.experts.5.down_proj",
"mlp.experts.6.down_proj",
"mlp.experts.7.down_proj",
"mlp.experts.8.down_proj",
"mlp.experts.9.down_proj",
"mlp.experts.10.down_proj",
"mlp.experts.11.down_proj",
"mlp.experts.12.down_proj",
"mlp.experts.13.down_proj",
"mlp.experts.14.down_proj",
"mlp.experts.15.down_proj",
"mlp.experts.16.down_proj",
"mlp.experts.17.down_proj",
"mlp.experts.18.down_proj",
"mlp.experts.19.down_proj",
"mlp.experts.20.down_proj",
"mlp.experts.21.down_proj",
"mlp.experts.22.down_proj",
"mlp.experts.23.down_proj",
"mlp.experts.24.down_proj",
"mlp.experts.25.down_proj",
"mlp.experts.26.down_proj",
"mlp.experts.27.down_proj",
"mlp.experts.28.down_proj",
"mlp.experts.29.down_proj",
"mlp.experts.30.down_proj",
"mlp.experts.31.down_proj",
"mlp.experts.32.down_proj",
"mlp.experts.33.down_proj",
"mlp.experts.34.down_proj",
"mlp.experts.35.down_proj",
"mlp.experts.36.down_proj",
"mlp.experts.37.down_proj",
"mlp.experts.38.down_proj",
"mlp.experts.39.down_proj",
"mlp.experts.40.down_proj",
"mlp.experts.41.down_proj",
"mlp.experts.42.down_proj",
"mlp.experts.43.down_proj",
"mlp.experts.44.down_proj",
"mlp.experts.45.down_proj",
"mlp.experts.46.down_proj",
"mlp.experts.47.down_proj",
"mlp.experts.48.down_proj",
"mlp.experts.49.down_proj",
"mlp.experts.50.down_proj",
"mlp.experts.51.down_proj",
"mlp.experts.52.down_proj",
"mlp.experts.53.down_proj",
"mlp.experts.54.down_proj",
"mlp.experts.55.down_proj",
"mlp.experts.56.down_proj",
"mlp.experts.57.down_proj",
"mlp.experts.58.down_proj",
"mlp.experts.59.down_proj"
]
],
"pad_token_id": null,
"quant_method": "gptq",
"sym": true,
"tokenizer": null,
"true_sequential": true,
"use_cuda_fp16": false,
"use_exllama": true
},
"rms_norm_eps": 1e-06,
"rope_theta": 1000000.0,
"router_aux_loss_coef": 0.001,
"shared_expert_intermediate_size": 5632,
"sliding_window": 32768,
"tie_word_embeddings": false,
"torch_dtype": "float16",
"transformers_version": "4.39.0.dev0",
"use_cache": true,
"use_sliding_window": false,
"vocab_size": 151936
}
To Reproduce
Run the following code:
from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

Model/Datasets
https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4
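A possible workaround I have not verified, assuming gptqmodel's loader and backend selection work as I understand them from the project README: loading through GPTQModel.load with a non-Marlin backend may sidestep the kernel's rejection of empty inputs, since the dense torch path handles M = 0.

# Hypothetical workaround sketch, not verified: select a different kernel
# backend via gptqmodel's own loader instead of the Marlin kernel.
from gptqmodel import GPTQModel, BACKEND

model = GPTQModel.load(
    "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
    backend=BACKEND.TORCH,  # assumption: TORCH is available in v1.9.0
)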