
[BUG] RuntimeError: Invalid MNK = [0, 1408, 2048] #1361

@Creling

Description

Describe the bug

Hello, I am trying to run Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4 using the official code snippet and I hit the following error:

Traceback (most recent call last):
  File "/home/foo/PPML/moe-test/qwen_q.py", line 21, in <module>
    generated_ids = model.generate(
  File "/home/foo/.local/lib/python3.10/site-packages/torch/utils/_contextlib.py", line 116, in decorate_context
    return func(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/generation/utils.py", line 2250, in generate
    result = self._sample(
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/generation/utils.py", line 3238, in _sample
    outputs = self(**model_inputs, return_dict=True)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/utils/deprecation.py", line 172, in wrapped_func
    return func(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 1316, in forward
    outputs = self.model(
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 1017, in forward
    layer_outputs = decoder_layer(
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 745, in forward
    hidden_states = self.mlp(hidden_states)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 654, in forward
    current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/transformers/models/qwen2_moe/modeling_qwen2_moe.py", line 280, in forward
    return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/foo/.local/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1750, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/gptqmodel/nn_modules/qlinear/marlin.py", line 375, in forward
    return apply_gptq_marlin_linear(
  File "/home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages/gptqmodel/nn_modules/qlinear/marlin.py", line 139, in apply_gptq_marlin_linear
    output = gptqmodel_marlin_kernels.gptq_marlin_gemm(reshaped_x,
RuntimeError: Invalid MNK = [0, 1408, 2048]
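For context on the numbers in the error: with hidden_size = 2048 and moe_intermediate_size = 1408 (see the model config below), MNK = [0, 1408, 2048] looks like an expert gate_proj/up_proj GEMM that was handed zero routed tokens, i.e. M = 0. This is only my reading of the traceback; the snippet below is a plain nn.Linear stand-in that just illustrates the empty-batch shape which seems to reach the Marlin kernel, not the kernel call itself.

import torch

# Hypothetical stand-in, not the Marlin kernel: it only reproduces the shapes
# from the error message. hidden_size = 2048 (K) and moe_intermediate_size =
# 1408 (N) are taken from the model config below.
hidden_size = 2048
moe_intermediate_size = 1408

# an expert that receives zero routed tokens in this decoding step -> M == 0
current_state = torch.empty((0, hidden_size))

# A plain Linear accepts the empty batch and returns a (0, 1408) tensor, so the
# unquantized model would pass through this case silently.
gate_proj = torch.nn.Linear(hidden_size, moe_intermediate_size, bias=False)
print(gate_proj(current_state).shape)  # torch.Size([0, 1408])

# My guess is that gptq_marlin_gemm rejects M == 0, which would match
# "Invalid MNK = [0, 1408, 2048]" at the same point in the expert loop.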

GPU Info

+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 555.42.06              Driver Version: 555.42.06      CUDA Version: 12.5     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  NVIDIA A800 80GB PCIe          On  |   00000000:40:00.0 Off |                    0 |
| N/A   26C    P0             41W /  300W |       1MiB /  81920MiB |      0%      Default |
|                                         |                        |             Disabled |
+-----------------------------------------+------------------------+----------------------+
                                                                                         
+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|  No running processes found                                                             |
+-----------------------------------------------------------------------------------------+

Software Info

No LSB modules are available.
Distributor ID: Ubuntu
Description:    Ubuntu 22.04.4 LTS
Release:        22.04
Codename:       jammy
Name: gptqmodel
Version: 1.9.0
Summary: A LLM quantization package with user-friendly apis. Based on GPTQ algorithm.
Home-page: https://github.com/ModelCloud/GPTQModel
Author: ModelCloud
Author-email: [email protected]
License: Apache 2.0
Location: /home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages
Requires: accelerate, datasets, device-smi, hf_transfer, huggingface_hub, numpy, packaging, pillow, protobuf, safetensors, threadpoolctl, tokenicer, torch, transformers
Required-by: 
---
Name: torch
Version: 2.6.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: [email protected]
License: BSD-3-Clause
Location: /home/foo/.local/lib/python3.10/site-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-cusparselt-cu12, nvidia-nccl-cu12, nvidia-nvjitlink-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, gptqmodel, optimum, peft, torchaudio, torchvision
---
Name: transformers
Version: 4.50.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: [email protected]
License: Apache 2.0 License
Location: /home/foo/miniconda3/envs/huggingface/lib/python3.10/site-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: gptqmodel, optimum, peft, tokenicer
---
Name: accelerate
Version: 1.4.0
Summary: Accelerate
Home-page: https://github.com/huggingface/accelerate
Author: The HuggingFace team
Author-email: [email protected]
License: Apache
Location: /home/foo/.local/lib/python3.10/site-packages
Requires: huggingface-hub, numpy, packaging, psutil, pyyaml, safetensors, torch
Required-by: gptqmodel, peft
---
Name: triton
Version: 3.2.0
Summary: A language and compiler for custom Deep Learning operations
Home-page: https://github.com/triton-lang/triton/
Author: Philippe Tillet
Author-email: [email protected]
License: 
Location: /home/foo/.local/lib/python3.10/site-packages
Requires: 
Required-by: torch

{
  "architectures": [
    "Qwen2MoeForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 151643,
  "eos_token_id": 151645,
  "decoder_sparse_step": 1,
  "hidden_act": "silu",
  "hidden_size": 2048,
  "initializer_range": 0.02,
  "intermediate_size": 5632,
  "max_position_embeddings": 32768,
  "max_window_layers": 21,
  "model_type": "qwen2_moe",
  "moe_intermediate_size": 1408,
  "norm_topk_prob": false,
  "num_attention_heads": 16,
  "num_experts": 60,
  "num_experts_per_tok": 4,
  "num_hidden_layers": 24,
  "num_key_value_heads": 16,
  "output_router_logits": false,
  "quantization_config": {
    "batch_size": 1,
    "bits": 4,
    "block_name_to_quantize": null,
    "cache_block_outputs": true,
    "damp_percent": 0.01,
    "dataset": null,
    "desc_act": false,
    "exllama_config": {
      "version": 1
    },
    "group_size": 128,
    "max_input_length": null,
    "model_seqlen": null,
    "module_name_preceding_first_block": null,
    "modules_in_block_to_quantize": [
      [
        "self_attn.k_proj",
        "self_attn.v_proj",
        "self_attn.q_proj"
      ],
      [
        "self_attn.o_proj"
      ],
      [
        "mlp.shared_expert.up_proj",
        "mlp.shared_expert.gate_proj"
      ],
      [
        "mlp.shared_expert.down_proj"
      ],
      [
        "mlp.experts.0.up_proj",
        "mlp.experts.1.up_proj",
        "mlp.experts.2.up_proj",
        "mlp.experts.3.up_proj",
        "mlp.experts.4.up_proj",
        "mlp.experts.5.up_proj",
        "mlp.experts.6.up_proj",
        "mlp.experts.7.up_proj",
        "mlp.experts.8.up_proj",
        "mlp.experts.9.up_proj",
        "mlp.experts.10.up_proj",
        "mlp.experts.11.up_proj",
        "mlp.experts.12.up_proj",
        "mlp.experts.13.up_proj",
        "mlp.experts.14.up_proj",
        "mlp.experts.15.up_proj",
        "mlp.experts.16.up_proj",
        "mlp.experts.17.up_proj",
        "mlp.experts.18.up_proj",
        "mlp.experts.19.up_proj",
        "mlp.experts.20.up_proj",
        "mlp.experts.21.up_proj",
        "mlp.experts.22.up_proj",
        "mlp.experts.23.up_proj",
        "mlp.experts.24.up_proj",
        "mlp.experts.25.up_proj",
        "mlp.experts.26.up_proj",
        "mlp.experts.27.up_proj",
        "mlp.experts.28.up_proj",
        "mlp.experts.29.up_proj",
        "mlp.experts.30.up_proj",
        "mlp.experts.31.up_proj",
        "mlp.experts.32.up_proj",
        "mlp.experts.33.up_proj",
        "mlp.experts.34.up_proj",
        "mlp.experts.35.up_proj",
        "mlp.experts.36.up_proj",
        "mlp.experts.37.up_proj",
        "mlp.experts.38.up_proj",
        "mlp.experts.39.up_proj",
        "mlp.experts.40.up_proj",
        "mlp.experts.41.up_proj",
        "mlp.experts.42.up_proj",
        "mlp.experts.43.up_proj",
        "mlp.experts.44.up_proj",
        "mlp.experts.45.up_proj",
        "mlp.experts.46.up_proj",
        "mlp.experts.47.up_proj",
        "mlp.experts.48.up_proj",
        "mlp.experts.49.up_proj",
        "mlp.experts.50.up_proj",
        "mlp.experts.51.up_proj",
        "mlp.experts.52.up_proj",
        "mlp.experts.53.up_proj",
        "mlp.experts.54.up_proj",
        "mlp.experts.55.up_proj",
        "mlp.experts.56.up_proj",
        "mlp.experts.57.up_proj",
        "mlp.experts.58.up_proj",
        "mlp.experts.59.up_proj",
        "mlp.experts.0.gate_proj",
        "mlp.experts.1.gate_proj",
        "mlp.experts.2.gate_proj",
        "mlp.experts.3.gate_proj",
        "mlp.experts.4.gate_proj",
        "mlp.experts.5.gate_proj",
        "mlp.experts.6.gate_proj",
        "mlp.experts.7.gate_proj",
        "mlp.experts.8.gate_proj",
        "mlp.experts.9.gate_proj",
        "mlp.experts.10.gate_proj",
        "mlp.experts.11.gate_proj",
        "mlp.experts.12.gate_proj",
        "mlp.experts.13.gate_proj",
        "mlp.experts.14.gate_proj",
        "mlp.experts.15.gate_proj",
        "mlp.experts.16.gate_proj",
        "mlp.experts.17.gate_proj",
        "mlp.experts.18.gate_proj",
        "mlp.experts.19.gate_proj",
        "mlp.experts.20.gate_proj",
        "mlp.experts.21.gate_proj",
        "mlp.experts.22.gate_proj",
        "mlp.experts.23.gate_proj",
        "mlp.experts.24.gate_proj",
        "mlp.experts.25.gate_proj",
        "mlp.experts.26.gate_proj",
        "mlp.experts.27.gate_proj",
        "mlp.experts.28.gate_proj",
        "mlp.experts.29.gate_proj",
        "mlp.experts.30.gate_proj",
        "mlp.experts.31.gate_proj",
        "mlp.experts.32.gate_proj",
        "mlp.experts.33.gate_proj",
        "mlp.experts.34.gate_proj",
        "mlp.experts.35.gate_proj",
        "mlp.experts.36.gate_proj",
        "mlp.experts.37.gate_proj",
        "mlp.experts.38.gate_proj",
        "mlp.experts.39.gate_proj",
        "mlp.experts.40.gate_proj",
        "mlp.experts.41.gate_proj",
        "mlp.experts.42.gate_proj",
        "mlp.experts.43.gate_proj",
        "mlp.experts.44.gate_proj",
        "mlp.experts.45.gate_proj",
        "mlp.experts.46.gate_proj",
        "mlp.experts.47.gate_proj",
        "mlp.experts.48.gate_proj",
        "mlp.experts.49.gate_proj",
        "mlp.experts.50.gate_proj",
        "mlp.experts.51.gate_proj",
        "mlp.experts.52.gate_proj",
        "mlp.experts.53.gate_proj",
        "mlp.experts.54.gate_proj",
        "mlp.experts.55.gate_proj",
        "mlp.experts.56.gate_proj",
        "mlp.experts.57.gate_proj",
        "mlp.experts.58.gate_proj",
        "mlp.experts.59.gate_proj"
      ],
      [
        "mlp.experts.0.down_proj",
        "mlp.experts.1.down_proj",
        "mlp.experts.2.down_proj",
        "mlp.experts.3.down_proj",
        "mlp.experts.4.down_proj",
        "mlp.experts.5.down_proj",
        "mlp.experts.6.down_proj",
        "mlp.experts.7.down_proj",
        "mlp.experts.8.down_proj",
        "mlp.experts.9.down_proj",
        "mlp.experts.10.down_proj",
        "mlp.experts.11.down_proj",
        "mlp.experts.12.down_proj",
        "mlp.experts.13.down_proj",
        "mlp.experts.14.down_proj",
        "mlp.experts.15.down_proj",
        "mlp.experts.16.down_proj",
        "mlp.experts.17.down_proj",
        "mlp.experts.18.down_proj",
        "mlp.experts.19.down_proj",
        "mlp.experts.20.down_proj",
        "mlp.experts.21.down_proj",
        "mlp.experts.22.down_proj",
        "mlp.experts.23.down_proj",
        "mlp.experts.24.down_proj",
        "mlp.experts.25.down_proj",
        "mlp.experts.26.down_proj",
        "mlp.experts.27.down_proj",
        "mlp.experts.28.down_proj",
        "mlp.experts.29.down_proj",
        "mlp.experts.30.down_proj",
        "mlp.experts.31.down_proj",
        "mlp.experts.32.down_proj",
        "mlp.experts.33.down_proj",
        "mlp.experts.34.down_proj",
        "mlp.experts.35.down_proj",
        "mlp.experts.36.down_proj",
        "mlp.experts.37.down_proj",
        "mlp.experts.38.down_proj",
        "mlp.experts.39.down_proj",
        "mlp.experts.40.down_proj",
        "mlp.experts.41.down_proj",
        "mlp.experts.42.down_proj",
        "mlp.experts.43.down_proj",
        "mlp.experts.44.down_proj",
        "mlp.experts.45.down_proj",
        "mlp.experts.46.down_proj",
        "mlp.experts.47.down_proj",
        "mlp.experts.48.down_proj",
        "mlp.experts.49.down_proj",
        "mlp.experts.50.down_proj",
        "mlp.experts.51.down_proj",
        "mlp.experts.52.down_proj",
        "mlp.experts.53.down_proj",
        "mlp.experts.54.down_proj",
        "mlp.experts.55.down_proj",
        "mlp.experts.56.down_proj",
        "mlp.experts.57.down_proj",
        "mlp.experts.58.down_proj",
        "mlp.experts.59.down_proj"
      ]
    ],
    "pad_token_id": null,
    "quant_method": "gptq",
    "sym": true,
    "tokenizer": null,
    "true_sequential": true,
    "use_cuda_fp16": false,
    "use_exllama": true
  },
  "rms_norm_eps": 1e-06,
  "rope_theta": 1000000.0,
  "router_aux_loss_coef": 0.001,
  "shared_expert_intermediate_size": 5632,
  "sliding_window": 32768,
  "tie_word_embeddings": false,
  "torch_dtype": "float16",
  "transformers_version": "4.39.0.dev0",
  "use_cache": true,
  "use_sliding_window": false,
  "vocab_size": 151936
}

To Reproduce

Just run the following code:

from transformers import AutoModelForCausalLM, AutoTokenizer

model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-MoE-A2.7B-Chat")

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
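
In case it is useful: as a possible workaround I considered loading the checkpoint through GPTQModel's own loader and forcing a non-Marlin kernel. I have not verified that this avoids the error, and the backend argument below is based on my reading of the GPTQModel README, so please treat it as an untested sketch.

from gptqmodel import GPTQModel, BACKEND  # assumes the installed gptqmodel exports BACKEND

# Untested sketch: request a non-Marlin kernel when loading the quantized model.
# BACKEND.TRITON is my guess at the enum member; adjust to whatever backends the
# installed version actually lists.
model = GPTQModel.load(
    "Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4",
    backend=BACKEND.TRITON,
)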

Model/Datasets

https://huggingface.co/Qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4
