From 279874f191d75bb354d9b82f316826d2c9f9afb3 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Wed, 10 Sep 2025 16:12:22 +0800 Subject: [PATCH 1/2] [llvm][LoongArch] Introduce LASX and LSX conversion intrinsics This patch introduces the LASX and LSX conversion intrinsics: - <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) - <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) - <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) - <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) - <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) - <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) - <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) - <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) - <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) - <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) --- llvm/include/llvm/IR/IntrinsicsLoongArch.td | 38 +++ .../LoongArch/LoongArchISelLowering.cpp | 5 + .../LoongArch/LoongArchLASXInstrInfo.td | 31 +++ .../LoongArch/lasx/intrinsic-conversion.ll | 234 ++++++++++++++++++ 4 files changed, 308 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td index 84026aa9d3624..1c46965d995fe 100644 --- 
a/llvm/include/llvm/IR/IntrinsicsLoongArch.td +++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td @@ -1192,4 +1192,42 @@ def int_loongarch_lasx_xvstelm_w def int_loongarch_lasx_xvstelm_d : VecInt<[], [llvm_v4i64_ty, llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty], [IntrWriteMem, IntrArgMemOnly, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>; + +// LASX and LSX conversion +def int_loongarch_lasx_cast_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_cast_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_s + : VecInt<[llvm_v8f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128_d + : VecInt<[llvm_v4f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_concat_128 + : VecInt<[llvm_v4i64_ty], [llvm_v2i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_lo + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_s + : VecInt<[llvm_v4f32_ty], [llvm_v8f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi_d + : VecInt<[llvm_v2f64_ty], [llvm_v4f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_extract_128_hi + : VecInt<[llvm_v2i64_ty], [llvm_v4i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_lo + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi_s + : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v4f32_ty], [IntrNoMem]>; 
+def int_loongarch_lasx_insert_128_hi_d + : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v2f64_ty], [IntrNoMem]>; +def int_loongarch_lasx_insert_128_hi + : VecInt<[llvm_v4i64_ty], [llvm_v4i64_ty, llvm_v2i64_ty], [IntrNoMem]>; } // TargetPrefix = "loongarch" diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 80c96c6dc8eb6..9f11e0f23cdff 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -6612,6 +6612,11 @@ performINTRINSIC_WO_CHAINCombine(SDNode *N, SelectionDAG &DAG, return DAG.getNode(LoongArchISD::VANY_NONZERO, DL, N->getValueType(0), N->getOperand(1)); break; + case Intrinsic::loongarch_lasx_concat_128_s: + case Intrinsic::loongarch_lasx_concat_128_d: + case Intrinsic::loongarch_lasx_concat_128: + return DAG.getNode(ISD::CONCAT_VECTORS, DL, N->getValueType(0), + N->getOperand(1), N->getOperand(2)); } return SDValue(); } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 613dea6093f5f..73fa204f4739d 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -2088,6 +2088,37 @@ defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; defm : subvector_subreg_lowering; +// LASX and LSX conversion +def : Pat<(int_loongarch_lasx_cast_128_s (v4f32 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128_d (v2f64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_cast_128 (v2i64 LSX128:$src)), + (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, 
sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_lo (v4i64 LASX256:$src)), + (EXTRACT_SUBREG LASX256:$src, sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_s (v8f32 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi_d (v4f64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_extract_128_hi (v4i64 LASX256:$src)), + (EXTRACT_SUBREG (XVPERMI_Q (IMPLICIT_DEF), LASX256:$src, 1), sub_128)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_lo (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 48)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_s (v8f32 LASX256:$src), (v4f32 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi_d (v4f64 LASX256:$src), (v2f64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; +def : Pat<(int_loongarch_lasx_insert_128_hi (v4i64 LASX256:$src), (v2i64 LSX128:$lo)), + (XVPERMI_Q LASX256:$src, (INSERT_SUBREG (IMPLICIT_DEF), LSX128:$lo, sub_128), 2)>; } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll new file mode 100644 index 0000000000000..7b418cf342714 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -0,0 +1,234 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) + +define <8 x float> @lasx_cast_128_s(<4 x float> %va) { +; CHECK-LABEL: lasx_cast_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %va) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) + +define <4 x double> @lasx_cast_128_d(<2 x double> %va) { +; CHECK-LABEL: lasx_cast_128_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %va) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64>) + +define <4 x i64> @lasx_cast_128(<2 x i64> %va) { +; CHECK-LABEL: lasx_cast_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %va) + ret <4 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) + +define <8 x float> @lasx_concat_128_s(<4 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_concat_128_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) + +define <4 x double> @lasx_concat_128_d(<2 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_concat_128_d: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) + +define <4 x i64> @lasx_concat_128(<2 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_concat_128: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} + +declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) + +define <4 x float> @lasx_extract_128_lo_s(<8 x float> %va) { +; CHECK-LABEL: lasx_extract_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %va) + ret <4 x float> %res +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) + +define <2 x double> @lasx_extract_128_lo_d(<4 x double> %va) { +; CHECK-LABEL: lasx_extract_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %va) + ret <2 x double> %res +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) + +define <2 x i64> @lasx_extract_128_lo(<4 x i64> %va) { +; CHECK-LABEL: lasx_extract_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %va) + ret <2 x i64> %res +} + +declare <4 x float> 
@llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) + +define <4 x float> @lasx_extract_128_hi_s(<8 x float> %va) { +; CHECK-LABEL: lasx_extract_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %va) + ret <4 x float> %res +} + +declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) + +define <2 x double> @lasx_extract_128_hi_d(<4 x double> %va) { +; CHECK-LABEL: lasx_extract_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %va) + ret <2 x double> %res +} + +declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) + +define <2 x i64> @lasx_extract_128_hi(<4 x i64> %va) { +; CHECK-LABEL: lasx_extract_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 +; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: ret +entry: + %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %va) + ret <2 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) + +define <8 x float> @lasx_insert_128_lo_s(<8 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_insert_128_lo_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) + +define <4 x double> @lasx_insert_128_lo_d(<4 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_insert_128_lo_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: 
def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) + +define <4 x i64> @lasx_insert_128_lo(<4 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_insert_128_lo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} + +declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) + +define <8 x float> @lasx_insert_128_hi_s(<8 x float> %va, <4 x float> %vb) { +; CHECK-LABEL: lasx_insert_128_hi_s: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %va, <4 x float> %vb) + ret <8 x float> %res +} + +declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) + +define <4 x double> @lasx_insert_128_hi_d(<4 x double> %va, <2 x double> %vb) { +; CHECK-LABEL: lasx_insert_128_hi_d: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %va, <2 x double> %vb) + ret <4 x double> %res +} + +declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) + +define <4 x i64> @lasx_insert_128_hi(<4 x i64> %va, <2 x i64> %vb) { +; CHECK-LABEL: lasx_insert_128_hi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: ret +entry: + %res = call <4 x i64> 
@llvm.loongarch.lasx.insert.128.hi(<4 x i64> %va, <2 x i64> %vb) + ret <4 x i64> %res +} From 4a86c1dd773dc5d39c4cb2a1105cf5a9b6b16c95 Mon Sep 17 00:00:00 2001 From: WANG Rui Date: Tue, 4 Nov 2025 20:16:39 +0800 Subject: [PATCH 2/2] Address Weining's comment --- .../LoongArch/lasx/intrinsic-conversion.ll | 219 ++++++++++++------ 1 file changed, 144 insertions(+), 75 deletions(-) diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll index 7b418cf342714..006713ccabf47 100644 --- a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll +++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-conversion.ll @@ -4,231 +4,300 @@ declare <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float>) -define <8 x float> @lasx_cast_128_s(<4 x float> %va) { +define void @lasx_cast_128_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %va) - ret <8 x float> %res + %a = load <4 x float>, ptr %va + %b = call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> %a) + store <8 x float> %b, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double>) -define <4 x double> @lasx_cast_128_d(<2 x double> %va) { +define void @lasx_cast_128_d(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %va) - ret <4 x double> %res + %a = load <2 x double>, ptr %va + %b = call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> %a) + store <4 x double> %b, ptr %vd + ret void } declare <4 x i64> 
@llvm.loongarch.lasx.cast.128(<2 x i64>) -define <4 x i64> @lasx_cast_128(<2 x i64> %va) { +define void @lasx_cast_128(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_cast_128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %va) - ret <4 x i64> %res + %a = load <2 x i64>, ptr %va + %b = call <4 x i64> @llvm.loongarch.lasx.cast.128(<2 x i64> %a) + store <4 x i64> %b, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float>, <4 x float>) -define <8 x float> @lasx_concat_128_s(<4 x float> %va, <4 x float> %vb) { +define void @lasx_concat_128_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <4 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double>, <2 x double>) -define <4 x double> @lasx_concat_128_d(<2 x double> %va, <2 x double> %vb) { +define void @lasx_concat_128_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x 
double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <2 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64>, <2 x i64>) -define <4 x i64> @lasx_concat_128(<2 x i64> %va, <2 x i64> %vb) { +define void @lasx_concat_128(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_concat_128: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 def $xr0 +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <2 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.concat.128(<2 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void } declare <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float>) -define <4 x float> @lasx_extract_128_lo_s(<8 x float> %va) { +define void @lasx_extract_128_lo_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_lo_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %va) - ret <4 x float> %res + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void } declare <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double>) -define <2 x double> @lasx_extract_128_lo_d(<4 x double> %va) { +define void @lasx_extract_128_lo_d(ptr %vd, ptr %va) { ; CHECK-LABEL: 
lasx_extract_128_lo_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %va) - ret <2 x double> %res + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void } declare <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64>) -define <2 x i64> @lasx_extract_128_lo(<4 x i64> %va) { +define void @lasx_extract_128_lo(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_lo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %va) - ret <2 x i64> %res + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.lo(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void } declare <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float>) -define <4 x float> @lasx_extract_128_hi_s(<8 x float> %va) { +define void @lasx_extract_128_hi_s(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_hi_s: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %va) - ret <4 x float> %res + %a = load <8 x float>, ptr %va + %c = call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> %a) + store <4 x float> %c, ptr %vd + ret void } declare <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double>) -define <2 x double> @lasx_extract_128_hi_d(<4 x double> %va) { +define void @lasx_extract_128_hi_d(ptr %vd, ptr %va) { ; CHECK-LABEL: 
lasx_extract_128_hi_d: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %va) - ret <2 x double> %res + %a = load <4 x double>, ptr %va + %c = call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> %a) + store <2 x double> %c, ptr %vd + ret void } declare <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64>) -define <2 x i64> @lasx_extract_128_hi(<4 x i64> %va) { +define void @lasx_extract_128_hi(ptr %vd, ptr %va) { ; CHECK-LABEL: lasx_extract_128_hi: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr0, 1 -; CHECK-NEXT: # kill: def $vr0 killed $vr0 killed $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %va) - ret <2 x i64> %res + %a = load <4 x i64>, ptr %va + %c = call <2 x i64> @llvm.loongarch.lasx.extract.128.hi(<4 x i64> %a) + store <2 x i64> %c, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float>, <4 x float>) -define <8 x float> @lasx_insert_128_lo_s(<8 x float> %va, <4 x float> %vb) { +define void @lasx_insert_128_lo_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } 
declare <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double>, <2 x double>) -define <4 x double> @lasx_insert_128_lo_d(<4 x double> %va, <2 x double> %vb) { +define void @lasx_insert_128_lo_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64>, <2 x i64>) -define <4 x i64> @lasx_insert_128_lo(<4 x i64> %va, <2 x i64> %vb) { +define void @lasx_insert_128_lo(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_lo: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 48 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> @llvm.loongarch.lasx.insert.128.lo(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void } declare <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float>, <4 x float>) -define <8 x float> @lasx_insert_128_hi_s(<8 x float> %va, <4 x float> %vb) { +define void @lasx_insert_128_hi_s(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi_s: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, 
$a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %va, <4 x float> %vb) - ret <8 x float> %res + %a = load <8 x float>, ptr %va + %b = load <4 x float>, ptr %vb + %c = call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> %a, <4 x float> %b) + store <8 x float> %c, ptr %vd + ret void } declare <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double>, <2 x double>) -define <4 x double> @lasx_insert_128_hi_d(<4 x double> %va, <2 x double> %vb) { +define void @lasx_insert_128_hi_d(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi_d: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %va, <2 x double> %vb) - ret <4 x double> %res + %a = load <4 x double>, ptr %va + %b = load <2 x double>, ptr %vb + %c = call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> %a, <2 x double> %b) + store <4 x double> %c, ptr %vd + ret void } declare <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64>, <2 x i64>) -define <4 x i64> @lasx_insert_128_hi(<4 x i64> %va, <2 x i64> %vb) { +define void @lasx_insert_128_hi(ptr %vd, ptr %va, ptr %vb) { ; CHECK-LABEL: lasx_insert_128_hi: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: # kill: def $vr1 killed $vr1 def $xr1 +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: vld $vr1, $a2, 0 ; CHECK-NEXT: xvpermi.q $xr0, $xr1, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 ; CHECK-NEXT: ret entry: - %res = call <4 x i64> @llvm.loongarch.lasx.insert.128.hi(<4 x i64> %va, <2 x i64> %vb) - ret <4 x i64> %res + %a = load <4 x i64>, ptr %va + %b = load <2 x i64>, ptr %vb + %c = call <4 x i64> 
@llvm.loongarch.lasx.insert.128.hi(<4 x i64> %a, <2 x i64> %b) + store <4 x i64> %c, ptr %vd + ret void }