From 279874f191d75bb354d9b82f316826d2c9f9afb3 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Wed, 10 Sep 2025 16:12:22 +0800 Subject: [PATCH 1/2] [llvm][LoongArch] Introduce LASX and LSX conversion intrinsics This patch introduces the LASX and LSX conversion intrinsics: - <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) - <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) - <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) - <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) --- llvm/include/llvm/IR/IntrinsicsLoongArch.td | 38 +++ .../LoongArch/LoongArchISelLowering.cpp | 5 + .../LoongArch/LoongArchLASXInstrInfo.td | 31 +++ .../LoongArch/lasx/intrinsic-conversion.ll | 234 ++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- 
a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; 
+def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6dc8eb6..9f11e0f23cdff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6612,6 +6612,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6093f5f..73fa204f4739d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2088,6 +2088,37 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, 
sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..7b418cf342714 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define <8 x float> @lasx_cast_128_s(<4 x float> %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %va) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define <4 x double> @lasx_cast_128_d(<2 x double> %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %va) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define <4 x i64> @lasx_cast_128(<2 x i64> %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %va) + ret <4 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define <8 x float> @lasx_concat_128_s(<4 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define <4 x double> @lasx_concat_128_d(<2 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define <4 x i64> @lasx_concat_128(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define <4 x float> @lasx_extract_128_lo_s(<8 x float> %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %va) + ret <4 x float> %res +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define <2 x double> @lasx_extract_128_lo_d(<4 x double> %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %va) + ret <2 x double> %res +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define <2 x i64> @lasx_extract_128_lo(<4 x i64> %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %va) + ret <2 x i64> %res +} + +declare <4 x float> 
@llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define <4 x float> @lasx_extract_128_hi_s(<8 x float> %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %va) + ret <4 x float> %res +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define <2 x double> @lasx_extract_128_hi_d(<4 x double> %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %va) + ret <2 x double> %res +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define <2 x i64> @lasx_extract_128_hi(<4 x i64> %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %va) + ret <2 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define <8 x float> @lasx_insert_128_lo_s(<8 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define <4 x double> @lasx_insert_128_lo_d(<4 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: 
def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define <4 x i64> @lasx_insert_128_lo(<4 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define <8 x float> @lasx_insert_128_hi_s(<8 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define <4 x double> @lasx_insert_128_hi_d(<4 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define <4 x i64> @lasx_insert_128_hi(<4 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> 
@llvm.loongarch.lasx.insert.128.hi(<4 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} From 4a86c1dd773dc5d39c4cb2a1105cf5a9b6b16c95 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 4 Nov 2025 20:16:39 +0800 Subject: [PATCH 2/2] Address Weining's comment --- .../LoongArch/lasx/intrinsic-conversion.ll | 219 ++++++++++++------ 1 file changed, 144 insertions(+), 75 deletions(-) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll index 7b418cf342714..006713ccabf47 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -4,231 +4,300 @@ declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) -define <8 x float> @lasx_cast_128_s(<4 x float> %va) { +define void @lasx_cast_128_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %va) - ret <8 x float> %res + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) -define <4 x double> @lasx_cast_128_d(<2 x double> %va) { +define void @lasx_cast_128_d(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %va) - ret <4 x double> %res + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void } declare <4 x i64> 
@llvm.loongarch.lasx.cast.128(<2 x i64>) -define <4 x i64> @lasx_cast_128(<2 x i64> %va) { +define void @lasx_cast_128(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %va) - ret <4 x i64> %res + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) -define <8 x float> @lasx_concat_128_s(<4 x float> %va, <4 x float> %vb) { +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) -define <4 x double> @lasx_concat_128_d(<2 x double> %va, <2 x double> %vb) { +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x 
double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) -define <4 x i64> @lasx_concat_128(<2 x i64> %va, <2 x i64> %vb) { +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void } declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) -define <4 x float> @lasx_extract_128_lo_s(<8 x float> %va) { +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_lo_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %va) - ret <4 x float> %res + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void } declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) -define <2 x double> @lasx_extract_128_lo_d(<4 x double> %va) { +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { ; CHECK-LABEL: 
lasx_extract_128_lo_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %va) - ret <2 x double> %res + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void } declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) -define <2 x i64> @lasx_extract_128_lo(<4 x i64> %va) { +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_lo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %va) - ret <2 x i64> %res + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void } declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) -define <4 x float> @lasx_extract_128_hi_s(<8 x float> %va) { +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_hi_s: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %va) - ret <4 x float> %res + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void } declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) -define <2 x double> @lasx_extract_128_hi_d(<4 x double> %va) { +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { ; CHECK-LABEL: 
lasx_extract_128_hi_d: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %va) - ret <2 x double> %res + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void } declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) -define <2 x i64> @lasx_extract_128_hi(<4 x i64> %va) { +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_hi: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %va) - ret <2 x i64> %res + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) -define <8 x float> @lasx_insert_128_lo_s(<8 x float> %va, <4 x float> %vb) { +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } 
declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) -define <4 x double> @lasx_insert_128_lo_d(<4 x double> %va, <2 x double> %vb) { +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) -define <4 x i64> @lasx_insert_128_lo(<4 x i64> %va, <2 x i64> %vb) { +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) -define <8 x float> @lasx_insert_128_hi_s(<8 x float> %va, <4 x float> %vb) { +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, 
$a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) -define <4 x double> @lasx_insert_128_hi_d(<4 x double> %va, <2 x double> %vb) { +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) -define <4 x i64> @lasx_insert_128_hi(<4 x i64> %va, <2 x i64> %vb) { +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> 
@llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void }