Commit 83a1105

feat: support ascend qwen2 and qwen2_moe (#6)
* feat: support ascend qwen2 and qwen2_moe
* fix: fix ascend mixtral
1 parent: 455786b · commit: 83a1105

File tree: 4 files changed (+215, −0 lines)

lmdeploy/pytorch/models/mixtral.py

Lines changed: 1 addition & 0 deletions
@@ -292,6 +292,7 @@ def forward(
         past_key_value: Optional[Tuple[torch.Tensor]] = None,
         output_attentions: bool = False,
         use_cache: bool = False,
+        **kwargs,
     ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
                Optional[Tuple[torch.Tensor]]]:
         """Rewrite of MistralAttention.forward."""

lmdeploy/pytorch/models/module_map.py

Lines changed: 22 additions & 0 deletions
@@ -394,3 +394,25 @@
     'transformers.models.mixtral.modeling_mixtral.MixtralSparseMoeBlock':
     f'{LMDEPLOY_PYTORCH_MODEL_PATH}.mixtral.PatchedMixtralSparseMoeBlockAscend',  # noqa: E501
 })
+
+# ascend qwen1.5
+ASCEND_MODULE_MAP.update({
+    'transformers.models.qwen2.modeling_qwen2.Qwen2Attention':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+    'transformers.models.qwen2.modeling_qwen2.Qwen2FlashAttention2':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+    'transformers.models.qwen2.modeling_qwen2.Qwen2SdpaAttention':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+})
+
+# ascend qwen2 moe
+ASCEND_MODULE_MAP.update({
+    'transformers.models.qwen2_moe.modeling_qwen2_moe.Qwen2MoeAttention':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+    'transformers.models.qwen2_moe.modeling_qwen2_moe.Qwen2MoeFlashAttention2':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+    'transformers.models.qwen2_moe.modeling_qwen2_moe.Qwen2MoeSdpaAttention':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2.PatchedQwen2AttentionAscend',
+    'transformers.models.qwen2_moe.modeling_qwen2_moe.Qwen2MoeSparseMoeBlock':
+    f'{LMDEPLOY_PYTORCH_MODEL_PATH}.qwen2_moe.PatchedQwen2MoeSparseMoeBlockAscend',  # noqa: E501
+})
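
These entries route the three transformers attention variants (eager, FlashAttention2, SDPA) for Qwen2 and Qwen2-MoE to the same patched Ascend attention class, and the Qwen2-MoE sparse block to its own Ascend rewrite. A minimal sketch of how such a dotted-path map can be resolved, assuming a hypothetical resolver (lmdeploy's real patching machinery differs):

import importlib

def resolve_patched(module_map: dict, origin_qualname: str):
    """Look up the patched class registered for an original class path."""
    target = module_map[origin_qualname]
    mod_name, _, cls_name = target.rpartition('.')
    # import the module that holds the patched class and return the class itself
    return getattr(importlib.import_module(mod_name), cls_name)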

lmdeploy/pytorch/models/qwen2.py

Lines changed: 147 additions & 0 deletions
@@ -146,3 +146,150 @@ def forward(
             past_key_value,
             world_size=world_size,
         )
+
+
+class PatchedQwen2AttentionAscend(nn.Module):
+
+    def _load_weights(self, loader, rank: int, world_size: int,
+                      device: torch.device):
+        """load weights."""
+        for mod_name in ['q_proj', 'k_proj', 'v_proj']:
+            colwise_parallelize_linear(getattr(self, mod_name),
+                                       loader,
+                                       rank=rank,
+                                       world_size=world_size,
+                                       prefix=mod_name)
+        for mod_name in ['o_proj']:
+            rowwise_parallelize_linear(getattr(self, mod_name),
+                                       loader,
+                                       rank=rank,
+                                       world_size=world_size,
+                                       prefix=mod_name)
+
+    @classmethod
+    def _distribute_output_fn(cls, outputs, **kwargs):
+        """Distribution output hook."""
+        dist.all_reduce(outputs[0])
+        return outputs
+
+    def _contiguous_batching_forward_impl(
+        self,
+        hidden_states: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        world_size: int = 1,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+               Optional[Tuple[torch.Tensor]]]:
+        """Rewrite implementation of forward.
+
+        Add continuous batching support. Add paged attention support. TP
+        support.
+        """
+        context = self.context.context
+        kv_seq_length = context.kv_seq_length
+        q_seq_length = context.q_seq_length
+        q_start_loc = context.q_start_loc
+        block_offsets = context.block_offsets
+        max_q_seq_length = context.max_q_seq_length
+        max_kv_seq_length = context.max_kv_seq_length
+
+        num_heads = self.num_heads // world_size
+        num_kv_heads = self.num_key_value_heads // world_size
+        head_dim = self.head_dim
+        hidden_size = num_heads * head_dim
+
+        def __qkv_proj(hidden_states):
+            """qkv proj."""
+            query_states = self.q_proj(hidden_states)
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
+            return query_states, key_states, value_states
+
+        def __rotary_emb_fn(query_states, key_states, value_states):
+            if hasattr(self, 'rotary_emb'):
+                cos, sin = self.rotary_emb(value_states,
+                                           seq_len=max_kv_seq_length)
+                query_states, key_states = apply_rotary_pos_emb(
+                    query_states,
+                    key_states,
+                    cos,
+                    sin,
+                    position_ids,
+                    context.position_ids_1d,
+                    context=context)
+            return query_states, key_states, value_states
+
+        query_states, key_states, value_states = __qkv_proj(hidden_states)
+
+        query_states = query_states.view(-1, num_heads, head_dim)
+        key_states = key_states.view(-1, num_kv_heads, head_dim)
+        value_states = value_states.view(-1, num_kv_heads, head_dim)
+
+        query_states, key_states, value_states = __rotary_emb_fn(
+            query_states, key_states, value_states)
+
+        fill_kv_cache(
+            key_states,
+            value_states,
+            past_key_value[0],
+            past_key_value[1],
+            q_start_loc,
+            q_seq_length,
+            kv_seq_length=kv_seq_length,
+            max_q_seq_length=max_q_seq_length,
+            block_offsets=block_offsets,
+            context=context,
+        )
+
+        attn_output = query_states
+
+        use_sliding_windows = (getattr(self.config, 'sliding_window', None)
+                               is not None and self.config.use_sliding_window)
+        window_size = self.config.sliding_window
+        if not use_sliding_windows:
+            window_size = -1
+        paged_attention_fwd(
+            query_states,
+            key_states,
+            value_states,
+            past_key_value[0],
+            past_key_value[1],
+            attn_output,
+            block_offsets,
+            q_start_loc=q_start_loc,
+            q_seqlens=q_seq_length,
+            kv_seqlens=kv_seq_length,
+            max_seqlen=max_q_seq_length,
+            window_size=window_size,
+            context=context,
+        )
+
+        attn_output = attn_output.reshape(*hidden_states.shape[:-1],
+                                          hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, None, past_key_value
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_value: Optional[Tuple[torch.Tensor]] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        **kwargs,
+    ) -> Tuple[torch.Tensor, Optional[torch.Tensor],
+               Optional[Tuple[torch.Tensor]]]:
+        """Rewrite of forward."""
+        world_size = 1
+        if dist.is_initialized():
+            world_size = dist.get_world_size()
+        return self._contiguous_batching_forward_impl(
+            hidden_states,
+            position_ids,
+            past_key_value,
+            world_size=world_size,
+        )
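
PatchedQwen2AttentionAscend flattens the batch into (num_tokens, num_heads, head_dim), applies rotary position embeddings, writes the new keys and values into the paged KV cache with fill_kv_cache, and then runs paged_attention_fwd (with the sliding window disabled unless config.use_sliding_window is set). As a hedged reference for the rotary step only, here is a plain-PyTorch sketch of the conventional rotate-half formula; the apply_rotary_pos_emb used above takes extra position and context arguments and may be fused on Ascend:

import torch

def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Map the last-dimension halves (x1, x2) to (-x2, x1)."""
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_reference(q: torch.Tensor, k: torch.Tensor,
                           cos: torch.Tensor, sin: torch.Tensor):
    """Rotary embedding for flattened q/k shaped (num_tokens, heads, head_dim);
    cos/sin are per-token tables shaped (num_tokens, head_dim)."""
    cos = cos.unsqueeze(1)  # broadcast over the head axis
    sin = sin.unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin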

lmdeploy/pytorch/models/qwen2_moe.py

Lines changed: 45 additions & 0 deletions
@@ -7,6 +7,8 @@
 from torch import nn
 
 from lmdeploy.pytorch.kernels.fused_moe import fused_moe
+from lmdeploy.pytorch.kernels.moe_gating_topk_softmax import \
+    moe_gating_topk_softmax
 
 
 class PatchedQwen2MoeSparseMoeBlock(nn.Module):
@@ -90,6 +92,49 @@ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return out_states, router_logits
 
 
+class PatchedQwen2MoeSparseMoeBlockAscend(nn.Module):
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        """"""
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        router_logits = self.gate(hidden_states)
+
+        routing_weights, selected_experts = moe_gating_topk_softmax(
+            router_logits, self.top_k)
+        if self.norm_topk_prob:
+            routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        out_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim),
+            dtype=hidden_states.dtype,
+            device=hidden_states.device)
+
+        expert_mask = torch.nn.functional.one_hot(
+            selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        for expert_idx in range(self.num_experts):
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx])
+
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(
+                current_state) * routing_weights[top_x, idx, None]
+
+            out_states.index_add_(
+                0, top_x, current_hidden_states.to(hidden_states.dtype))
+
+        shared_expert_output = self.shared_expert(hidden_states)
+        shared_expert_output = F.sigmoid(
+            self.shared_expert_gate(hidden_states)) * shared_expert_output
+
+        out_states = out_states + shared_expert_output
+        out_states = out_states.unflatten(0, (-1, sequence_length))
+
+        return out_states, router_logits
+
+
 class PatchedQwen2MoeModel(nn.Module):
 
     def _continuous_batching_forward(
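
The Ascend MoE block swaps the fused_moe kernel for moe_gating_topk_softmax routing and then dispatches tokens to experts with a one-hot mask before adding the gated shared-expert branch. As a hedged reference for the gating step only, a plain-PyTorch sketch of softmax-then-top-k routing in the style of the upstream Qwen2-MoE block (the Ascend kernel may fuse or reorder this):

import torch

def moe_gating_topk_softmax_reference(router_logits: torch.Tensor, top_k: int):
    """Softmax over experts, then keep the top_k weights per token.
    Returns (routing_weights, selected_experts), each shaped (num_tokens, top_k)."""
    probs = torch.softmax(router_logits, dim=-1, dtype=torch.float32)
    routing_weights, selected_experts = torch.topk(probs, top_k, dim=-1)
    return routing_weights, selected_experts

When norm_topk_prob is set, the patched block then renormalizes these weights so they sum to one per token, as in the diff above.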
