diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bfc061b404560..9845587968332 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -16609,8 +16609,7 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
   // Fold freeze(op(x, ...)) -> op(freeze(x), ...).
   // Try to push freeze through instructions that propagate but don't produce
   // poison as far as possible. If an operand of freeze follows three
-  // conditions 1) one-use, 2) does not produce poison, and 3) has all but one
-  // guaranteed-non-poison operands (or is a BUILD_VECTOR or similar) then push
+  // conditions 1) one-use, and 2) does not produce poison then push
   // the freeze through to the operands that are not guaranteed non-poison.
   // NOTE: we will strip poison-generating flags, so ignore them here.
   if (DAG.canCreateUndefOrPoison(N0, /*PoisonOnly*/ false,
@@ -16618,13 +16617,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       N0->getNumValues() != 1 || !N0->hasOneUse())
     return SDValue();
 
-  bool AllowMultipleMaybePoisonOperands =
-      N0.getOpcode() == ISD::SELECT_CC || N0.getOpcode() == ISD::SETCC ||
-      N0.getOpcode() == ISD::BUILD_VECTOR ||
-      N0.getOpcode() == ISD::BUILD_PAIR ||
-      N0.getOpcode() == ISD::VECTOR_SHUFFLE ||
-      N0.getOpcode() == ISD::CONCAT_VECTORS || N0.getOpcode() == ISD::FMUL;
-
   // Avoid turning a BUILD_VECTOR that can be recognized as "all zeros", "all
   // ones" or "constant" into something that depends on FrozenUndef. We can
   // instead pick undef values to keep those properties, while at the same time
@@ -16657,10 +16649,6 @@ SDValue DAGCombiner::visitFREEZE(SDNode *N) {
       MaybePoisonOperandNumbers.push_back(OpNo);
     if (!HadMaybePoisonOperands)
       continue;
-    if (IsNewMaybePoisonOperand && !AllowMultipleMaybePoisonOperands) {
-      // Multiple maybe-poison ops when not allowed - bail out.
-      return SDValue();
-    }
   }
   // NOTE: the whole op may be not guaranteed to not be undef or poison because
   // it could create undef or poison due to it's poison-generating flags.
@@ -23184,13 +23172,16 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 
   // Ensure all the operands are the same value type, fill any missing
   // operands with UNDEF and create the BUILD_VECTOR.
-  auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops) {
+  auto CanonicalizeBuildVector = [&](SmallVectorImpl<SDValue> &Ops,
+                                     bool FreezeUndef = false) {
     assert(Ops.size() == NumElts && "Unexpected vector size");
+    SDValue UndefOp = FreezeUndef ? DAG.getFreeze(DAG.getUNDEF(MaxEltVT))
+                                  : DAG.getUNDEF(MaxEltVT);
     for (SDValue &Op : Ops) {
       if (Op)
         Op = VT.isInteger() ? DAG.getAnyExtOrTrunc(Op, DL, MaxEltVT) : Op;
       else
-        Op = DAG.getUNDEF(MaxEltVT);
+        Op = UndefOp;
     }
     return DAG.getBuildVector(VT, DL, Ops);
   };
@@ -23204,6 +23195,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     if (CurVec.isUndef())
       return CanonicalizeBuildVector(Ops);
 
+    // FREEZE(UNDEF) - build new BUILD_VECTOR from already inserted operands.
+    if (ISD::isFreezeUndef(CurVec.getNode()) && CurVec.hasOneUse())
+      return CanonicalizeBuildVector(Ops, /*FreezeUndef=*/true);
+
     // BUILD_VECTOR - insert unused operands and build new BUILD_VECTOR.
if (CurVec.getOpcode() == ISD::BUILD_VECTOR && CurVec.hasOneUse()) { for (unsigned I = 0; I != NumElts; ++I) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 326dd7149ef96..30758345ab0f6 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3237,6 +3237,9 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, SDValue Src = Op.getOperand(0); + // Freeze the source since we are increasing the number of uses. + Src = DAG.getFreeze(Src); + MVT ContainerVT = VT; if (VT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget); @@ -3254,9 +3257,6 @@ lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(SDValue Op, SelectionDAG &DAG, std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); } - // Freeze the source since we are increasing the number of uses. - Src = DAG.getFreeze(Src); - // We do the conversion on the absolute value and fix the sign at the end. SDValue Abs = DAG.getNode(RISCVISD::FABS_VL, DL, ContainerVT, Src, Mask, VL); diff --git a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll index 830f1efbdf67f..5bd9ec2afdad1 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-propagate-sd-flags.ll @@ -12,7 +12,7 @@ ; CHECK: Legally typed node: [[VTWOA]]: v2f64 = BUILD_VECTOR ; CHECK: Legalizing node: [[VTWOB:t.*]]: v2f64 = BUILD_VECTOR ; CHECK: Legally typed node: [[VTWOB]]: v2f64 = BUILD_VECTOR -; CHECK: Legalizing node: t30: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]] +; CHECK: Legalizing node: t31: v2f64 = fmaxnum nnan reassoc [[VTWOB]], [[VTWOA]] target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-gnu" diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 1396099dbfa6a..a125880fd46a2 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -479,21 +479,28 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -504,6 +511,7 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1036,10 +1044,10 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -2654,21 +2662,28 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill 
-; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -2679,6 +2694,7 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -3211,10 +3227,10 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index d29a7a2dc5656..ab4e7e50539f6 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -517,21 +517,28 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -542,6 +549,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -1074,10 +1082,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) @@ -1890,21 +1898,28 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; implicit-def: $sgpr8 ; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; 
GFX9-O0-NEXT: v_mov_b32_e32 v10, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v7 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[8:9] ; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[12:13] +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[14:15], s[6:7] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] ; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] ; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 @@ -1915,6 +1930,7 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: s_mov_b32 s14, s13 ; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 +; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 ; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec @@ -2447,10 +2463,10 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 s[6:7], 1 ; GFX9-O0-NEXT: s_mov_b32 s5, s6 ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/NVPTX/i1-select.ll b/llvm/test/CodeGen/NVPTX/i1-select.ll index 6fb5aad4b1eb9..562c746200d87 100644 --- a/llvm/test/CodeGen/NVPTX/i1-select.ll +++ b/llvm/test/CodeGen/NVPTX/i1-select.ll @@ -94,27 +94,27 @@ define i32 @test_select_i1_basic(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %fals define i32 @test_select_i1_basic_folding(i32 %v1, i32 %v2, i32 %v3, i32 %true, i32 %false) { ; CHECK-LABEL: test_select_i1_basic_folding( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<13>; -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg 
.pred %p<12>; +; CHECK-NEXT: .reg .b32 %r<9>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [test_select_i1_basic_folding_param_0]; ; CHECK-NEXT: setp.eq.s32 %p1, %r1, 0; -; CHECK-NEXT: ld.param.b32 %r2, [test_select_i1_basic_folding_param_1]; -; CHECK-NEXT: setp.ne.s32 %p2, %r2, 0; -; CHECK-NEXT: setp.eq.s32 %p3, %r2, 0; -; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_2]; -; CHECK-NEXT: setp.eq.s32 %p4, %r3, 0; -; CHECK-NEXT: ld.param.b32 %r4, [test_select_i1_basic_folding_param_3]; +; CHECK-NEXT: ld.param.b32 %r3, [test_select_i1_basic_folding_param_1]; +; CHECK-NEXT: setp.ne.s32 %p2, %r3, 0; +; CHECK-NEXT: setp.eq.s32 %p3, %r3, 0; +; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_2]; +; CHECK-NEXT: setp.eq.s32 %p4, %r5, 0; +; CHECK-NEXT: ld.param.b32 %r6, [test_select_i1_basic_folding_param_3]; ; CHECK-NEXT: xor.pred %p6, %p1, %p3; -; CHECK-NEXT: ld.param.b32 %r5, [test_select_i1_basic_folding_param_4]; +; CHECK-NEXT: ld.param.b32 %r7, [test_select_i1_basic_folding_param_4]; ; CHECK-NEXT: and.pred %p7, %p6, %p4; -; CHECK-NEXT: and.pred %p9, %p2, %p4; -; CHECK-NEXT: and.pred %p10, %p3, %p7; -; CHECK-NEXT: or.pred %p11, %p10, %p9; -; CHECK-NEXT: xor.pred %p12, %p11, %p3; -; CHECK-NEXT: selp.b32 %r6, %r4, %r5, %p12; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: and.pred %p8, %p2, %p4; +; CHECK-NEXT: and.pred %p9, %p3, %p7; +; CHECK-NEXT: or.pred %p10, %p9, %p8; +; CHECK-NEXT: xor.pred %p11, %p10, %p3; +; CHECK-NEXT: selp.b32 %r8, %r6, %r7, %p11; +; CHECK-NEXT: st.param.b32 [func_retval0], %r8; ; CHECK-NEXT: ret; %b1 = icmp eq i32 %v1, 0 %b2 = icmp eq i32 %v2, 0 diff --git a/llvm/test/CodeGen/NVPTX/i128.ll b/llvm/test/CodeGen/NVPTX/i128.ll index 29408a24213cc..49dbc5c385dc0 100644 --- a/llvm/test/CodeGen/NVPTX/i128.ll +++ b/llvm/test/CodeGen/NVPTX/i128.ll @@ -5,9 +5,9 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: srem_i128( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<20>; +; CHECK-NEXT: .reg .pred %p<22>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<127>; +; CHECK-NEXT: .reg .b64 %rd<126>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [srem_i128_param_0]; @@ -42,102 +42,103 @@ define i128 @srem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd62, %r4; ; CHECK-NEXT: add.s64 %rd63, %rd62, 64; ; CHECK-NEXT: selp.b64 %rd64, %rd61, %rd63, %p7; -; CHECK-NEXT: mov.b64 %rd117, 0; +; CHECK-NEXT: mov.b64 %rd116, 0; ; CHECK-NEXT: sub.cc.s64 %rd66, %rd60, %rd64; -; CHECK-NEXT: subc.cc.s64 %rd67, %rd117, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd66, 127; -; CHECK-NEXT: setp.eq.s64 %p9, %rd67, 0; -; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.s64 %p11, %rd67, 0; -; CHECK-NEXT: or.pred %p12, %p10, %p11; -; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd68, %rd66, 127; -; CHECK-NEXT: or.b64 %rd69, %rd68, %rd67; -; CHECK-NEXT: setp.eq.s64 %p14, %rd69, 0; -; CHECK-NEXT: selp.b64 %rd126, 0, %rd4, %p13; -; CHECK-NEXT: selp.b64 %rd125, 0, %rd3, %p13; -; CHECK-NEXT: or.pred %p15, %p13, %p14; -; CHECK-NEXT: @%p15 bra $L__BB0_5; +; CHECK-NEXT: subc.cc.s64 %rd8, %rd116, 0; +; CHECK-NEXT: setp.ne.s64 %p8, %rd8, 0; +; CHECK-NEXT: and.pred %p10, %p8, %p8; +; CHECK-NEXT: setp.eq.s64 %p11, %rd8, 0; +; CHECK-NEXT: setp.gt.u64 %p12, %rd66, 127; +; CHECK-NEXT: and.pred %p13, %p11, %p12; +; CHECK-NEXT: or.pred %p14, %p13, %p10; +; CHECK-NEXT: or.pred %p15, %p5, %p14; +; CHECK-NEXT: xor.b64 %rd67, %rd66, 127; +; 
CHECK-NEXT: or.b64 %rd68, %rd67, %rd8; +; CHECK-NEXT: setp.eq.s64 %p16, %rd68, 0; +; CHECK-NEXT: selp.b64 %rd125, 0, %rd4, %p15; +; CHECK-NEXT: selp.b64 %rd124, 0, %rd3, %p15; +; CHECK-NEXT: or.pred %p17, %p15, %p16; +; CHECK-NEXT: @%p17 bra $L__BB0_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd119, %rd66, 1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd67, 0; -; CHECK-NEXT: or.b64 %rd72, %rd119, %rd120; -; CHECK-NEXT: setp.eq.s64 %p16, %rd72, 0; +; CHECK-NEXT: add.cc.s64 %rd118, %rd66, 1; +; CHECK-NEXT: addc.cc.s64 %rd119, %rd8, 0; +; CHECK-NEXT: or.b64 %rd71, %rd118, %rd119; +; CHECK-NEXT: setp.eq.s64 %p18, %rd71, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd66; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd73, %rd4, %r6; +; CHECK-NEXT: shl.b64 %rd72, %rd4, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd74, %rd3, %r7; -; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; +; CHECK-NEXT: shr.u64 %rd73, %rd3, %r7; +; CHECK-NEXT: or.b64 %rd74, %rd72, %rd73; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd76, %rd3, %r8; -; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd124, %rd76, %rd75, %p17; -; CHECK-NEXT: shl.b64 %rd123, %rd3, %r6; -; CHECK-NEXT: mov.b64 %rd114, %rd117; -; CHECK-NEXT: @%p16 bra $L__BB0_4; +; CHECK-NEXT: shl.b64 %rd75, %rd3, %r8; +; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63; +; CHECK-NEXT: selp.b64 %rd123, %rd75, %rd74, %p19; +; CHECK-NEXT: shl.b64 %rd122, %rd3, %r6; +; CHECK-NEXT: mov.b64 %rd113, %rd116; +; CHECK-NEXT: @%p18 bra $L__BB0_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd119; -; CHECK-NEXT: shr.u64 %rd79, %rd3, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd118; +; CHECK-NEXT: shr.u64 %rd78, %rd3, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd80, %rd4, %r10; -; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; +; CHECK-NEXT: shl.b64 %rd79, %rd4, %r10; +; CHECK-NEXT: or.b64 %rd80, %rd78, %rd79; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd82, %rd4, %r11; -; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd121, %rd82, %rd81, %p18; -; CHECK-NEXT: shr.u64 %rd122, %rd4, %r9; +; CHECK-NEXT: shr.u64 %rd81, %rd4, %r11; +; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63; +; CHECK-NEXT: selp.b64 %rd120, %rd81, %rd80, %p20; +; CHECK-NEXT: shr.u64 %rd121, %rd4, %r9; ; CHECK-NEXT: add.cc.s64 %rd35, %rd5, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd6, -1; -; CHECK-NEXT: mov.b64 %rd114, 0; -; CHECK-NEXT: mov.b64 %rd117, %rd114; +; CHECK-NEXT: mov.b64 %rd113, 0; +; CHECK-NEXT: mov.b64 %rd116, %rd113; ; CHECK-NEXT: $L__BB0_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd83, %rd121, 63; -; CHECK-NEXT: shl.b64 %rd84, %rd122, 1; -; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; -; CHECK-NEXT: shl.b64 %rd86, %rd121, 1; -; CHECK-NEXT: shr.u64 %rd87, %rd124, 63; -; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; -; CHECK-NEXT: shr.u64 %rd89, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd124, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd123, 1; -; CHECK-NEXT: or.b64 %rd123, %rd117, %rd92; -; CHECK-NEXT: or.b64 %rd124, %rd114, %rd91; -; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; -; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; -; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; -; CHECK-NEXT: and.b64 %rd117, %rd95, 1; -; CHECK-NEXT: and.b64 %rd96, %rd95, %rd5; -; CHECK-NEXT: and.b64 %rd97, %rd95, %rd6; -; CHECK-NEXT: sub.cc.s64 %rd121, %rd88, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd122, %rd85, %rd97; -; 
CHECK-NEXT: add.cc.s64 %rd119, %rd119, -1; -; CHECK-NEXT: addc.cc.s64 %rd120, %rd120, -1; -; CHECK-NEXT: or.b64 %rd98, %rd119, %rd120; -; CHECK-NEXT: setp.eq.s64 %p19, %rd98, 0; -; CHECK-NEXT: @%p19 bra $L__BB0_4; +; CHECK-NEXT: shr.u64 %rd82, %rd120, 63; +; CHECK-NEXT: shl.b64 %rd83, %rd121, 1; +; CHECK-NEXT: or.b64 %rd84, %rd83, %rd82; +; CHECK-NEXT: shl.b64 %rd85, %rd120, 1; +; CHECK-NEXT: shr.u64 %rd86, %rd123, 63; +; CHECK-NEXT: or.b64 %rd87, %rd85, %rd86; +; CHECK-NEXT: shr.u64 %rd88, %rd122, 63; +; CHECK-NEXT: shl.b64 %rd89, %rd123, 1; +; CHECK-NEXT: or.b64 %rd90, %rd89, %rd88; +; CHECK-NEXT: shl.b64 %rd91, %rd122, 1; +; CHECK-NEXT: or.b64 %rd122, %rd116, %rd91; +; CHECK-NEXT: or.b64 %rd123, %rd113, %rd90; +; CHECK-NEXT: sub.cc.s64 %rd92, %rd35, %rd87; +; CHECK-NEXT: subc.cc.s64 %rd93, %rd36, %rd84; +; CHECK-NEXT: shr.s64 %rd94, %rd93, 63; +; CHECK-NEXT: and.b64 %rd116, %rd94, 1; +; CHECK-NEXT: and.b64 %rd95, %rd94, %rd5; +; CHECK-NEXT: and.b64 %rd96, %rd94, %rd6; +; CHECK-NEXT: sub.cc.s64 %rd120, %rd87, %rd95; +; CHECK-NEXT: subc.cc.s64 %rd121, %rd84, %rd96; +; CHECK-NEXT: add.cc.s64 %rd118, %rd118, -1; +; CHECK-NEXT: addc.cc.s64 %rd119, %rd119, -1; +; CHECK-NEXT: or.b64 %rd97, %rd118, %rd119; +; CHECK-NEXT: setp.eq.s64 %p21, %rd97, 0; +; CHECK-NEXT: @%p21 bra $L__BB0_4; ; CHECK-NEXT: bra.uni $L__BB0_2; ; CHECK-NEXT: $L__BB0_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd99, %rd123, 63; -; CHECK-NEXT: shl.b64 %rd100, %rd124, 1; -; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; -; CHECK-NEXT: shl.b64 %rd102, %rd123, 1; -; CHECK-NEXT: or.b64 %rd125, %rd117, %rd102; -; CHECK-NEXT: or.b64 %rd126, %rd114, %rd101; +; CHECK-NEXT: shr.u64 %rd98, %rd122, 63; +; CHECK-NEXT: shl.b64 %rd99, %rd123, 1; +; CHECK-NEXT: or.b64 %rd100, %rd99, %rd98; +; CHECK-NEXT: shl.b64 %rd101, %rd122, 1; +; CHECK-NEXT: or.b64 %rd124, %rd116, %rd101; +; CHECK-NEXT: or.b64 %rd125, %rd113, %rd100; ; CHECK-NEXT: $L__BB0_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd103, %rd5, %rd125; -; CHECK-NEXT: mad.lo.s64 %rd104, %rd5, %rd126, %rd103; -; CHECK-NEXT: mad.lo.s64 %rd105, %rd6, %rd125, %rd104; -; CHECK-NEXT: mul.lo.s64 %rd106, %rd5, %rd125; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd3, %rd106; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd4, %rd105; +; CHECK-NEXT: mul.hi.u64 %rd102, %rd5, %rd124; +; CHECK-NEXT: mad.lo.s64 %rd103, %rd5, %rd125, %rd102; +; CHECK-NEXT: mad.lo.s64 %rd104, %rd6, %rd124, %rd103; +; CHECK-NEXT: mul.lo.s64 %rd105, %rd5, %rd124; +; CHECK-NEXT: sub.cc.s64 %rd106, %rd3, %rd105; +; CHECK-NEXT: subc.cc.s64 %rd107, %rd4, %rd104; +; CHECK-NEXT: xor.b64 %rd108, %rd106, %rd2; ; CHECK-NEXT: xor.b64 %rd109, %rd107, %rd2; -; CHECK-NEXT: xor.b64 %rd110, %rd108, %rd2; -; CHECK-NEXT: sub.cc.s64 %rd111, %rd109, %rd2; -; CHECK-NEXT: subc.cc.s64 %rd112, %rd110, %rd2; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd111, %rd112}; +; CHECK-NEXT: sub.cc.s64 %rd110, %rd108, %rd2; +; CHECK-NEXT: subc.cc.s64 %rd111, %rd109, %rd2; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd110, %rd111}; ; CHECK-NEXT: ret; %div = srem i128 %lhs, %rhs ret i128 %div @@ -148,7 +149,7 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<113>; +; CHECK-NEXT: .reg .b64 %rd<111>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [urem_i128_param_0]; @@ -172,98 +173,98 @@ define i128 @urem_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 
64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd103, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd103, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.s64 %p7, %rd57, 0; +; CHECK-NEXT: mov.b64 %rd101, 0; +; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd6, %rd101, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127; +; CHECK-NEXT: setp.eq.s64 %p7, %rd6, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.s64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.s64 %p9, %rd6, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.s64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd112, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd111, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd56, %rd5, 127; +; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; +; CHECK-NEXT: setp.eq.s64 %p12, %rd57, 0; +; CHECK-NEXT: selp.b64 %rd110, 0, %rd42, %p11; +; CHECK-NEXT: selp.b64 %rd109, 0, %rd41, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB1_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd105, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd105, %rd106; -; CHECK-NEXT: setp.eq.s64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd103, %rd5, 1; +; CHECK-NEXT: addc.cc.s64 %rd104, %rd6, 0; +; CHECK-NEXT: or.b64 %rd60, %rd103, %rd104; +; CHECK-NEXT: setp.eq.s64 %p14, %rd60, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd5; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7; +; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd110, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd109, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd100, %rd103; +; CHECK-NEXT: selp.b64 %rd108, %rd64, %rd63, %p15; +; CHECK-NEXT: shl.b64 %rd107, %rd41, %r6; +; CHECK-NEXT: mov.b64 %rd98, %rd101; ; CHECK-NEXT: @%p14 bra $L__BB1_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd105; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd103; +; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10; +; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd107, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd108, %rd42, %r9; +; CHECK-NEXT: selp.b64 %rd105, %rd70, %rd69, %p16; +; CHECK-NEXT: shr.u64 %rd106, %rd42, %r9; ; CHECK-NEXT: add.cc.s64 %rd33, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd100, 0; -; CHECK-NEXT: mov.b64 %rd103, %rd100; +; CHECK-NEXT: mov.b64 %rd98, 0; +; CHECK-NEXT: mov.b64 %rd101, %rd98; ; CHECK-NEXT: $L__BB1_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; 
CHECK-NEXT: shr.u64 %rd73, %rd107, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd108, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd107, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd110, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd110, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd109, 1; -; CHECK-NEXT: or.b64 %rd109, %rd103, %rd82; -; CHECK-NEXT: or.b64 %rd110, %rd100, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd103, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd3; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd107, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd108, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd105, %rd105, -1; -; CHECK-NEXT: addc.cc.s64 %rd106, %rd106, -1; -; CHECK-NEXT: or.b64 %rd88, %rd105, %rd106; -; CHECK-NEXT: setp.eq.s64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd71, %rd105, 63; +; CHECK-NEXT: shl.b64 %rd72, %rd106, 1; +; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; +; CHECK-NEXT: shl.b64 %rd74, %rd105, 1; +; CHECK-NEXT: shr.u64 %rd75, %rd108, 63; +; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; +; CHECK-NEXT: shr.u64 %rd77, %rd107, 63; +; CHECK-NEXT: shl.b64 %rd78, %rd108, 1; +; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77; +; CHECK-NEXT: shl.b64 %rd80, %rd107, 1; +; CHECK-NEXT: or.b64 %rd107, %rd101, %rd80; +; CHECK-NEXT: or.b64 %rd108, %rd98, %rd79; +; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; +; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; +; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; +; CHECK-NEXT: and.b64 %rd101, %rd83, 1; +; CHECK-NEXT: and.b64 %rd84, %rd83, %rd3; +; CHECK-NEXT: and.b64 %rd85, %rd83, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd105, %rd76, %rd84; +; CHECK-NEXT: subc.cc.s64 %rd106, %rd73, %rd85; +; CHECK-NEXT: add.cc.s64 %rd103, %rd103, -1; +; CHECK-NEXT: addc.cc.s64 %rd104, %rd104, -1; +; CHECK-NEXT: or.b64 %rd86, %rd103, %rd104; +; CHECK-NEXT: setp.eq.s64 %p17, %rd86, 0; ; CHECK-NEXT: @%p17 bra $L__BB1_4; ; CHECK-NEXT: bra.uni $L__BB1_2; ; CHECK-NEXT: $L__BB1_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd109, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd110, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd109, 1; -; CHECK-NEXT: or.b64 %rd111, %rd103, %rd92; -; CHECK-NEXT: or.b64 %rd112, %rd100, %rd91; +; CHECK-NEXT: shr.u64 %rd87, %rd107, 63; +; CHECK-NEXT: shl.b64 %rd88, %rd108, 1; +; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; +; CHECK-NEXT: shl.b64 %rd90, %rd107, 1; +; CHECK-NEXT: or.b64 %rd109, %rd101, %rd90; +; CHECK-NEXT: or.b64 %rd110, %rd98, %rd89; ; CHECK-NEXT: $L__BB1_5: // %udiv-end -; CHECK-NEXT: mul.hi.u64 %rd93, %rd3, %rd111; -; CHECK-NEXT: mad.lo.s64 %rd94, %rd3, %rd112, %rd93; -; CHECK-NEXT: mad.lo.s64 %rd95, %rd4, %rd111, %rd94; -; CHECK-NEXT: mul.lo.s64 %rd96, %rd3, %rd111; -; CHECK-NEXT: sub.cc.s64 %rd97, %rd41, %rd96; -; CHECK-NEXT: subc.cc.s64 %rd98, %rd42, %rd95; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd97, %rd98}; +; CHECK-NEXT: mul.hi.u64 %rd91, %rd3, %rd109; +; CHECK-NEXT: mad.lo.s64 %rd92, %rd3, %rd110, %rd91; +; CHECK-NEXT: mad.lo.s64 %rd93, %rd4, %rd109, %rd92; +; CHECK-NEXT: mul.lo.s64 %rd94, %rd3, %rd109; +; CHECK-NEXT: sub.cc.s64 %rd95, %rd41, %rd94; +; CHECK-NEXT: subc.cc.s64 %rd96, %rd42, %rd93; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd95, %rd96}; ; CHECK-NEXT: ret; %div = urem i128 %lhs, %rhs ret i128 %div @@ -306,9 
+307,9 @@ define i128 @urem_i128_pow2k(i128 %lhs) { define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-LABEL: sdiv_i128( ; CHECK: { -; CHECK-NEXT: .reg .pred %p<20>; +; CHECK-NEXT: .reg .pred %p<22>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<122>; +; CHECK-NEXT: .reg .b64 %rd<121>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd45, %rd46}, [sdiv_i128_param_0]; @@ -344,96 +345,97 @@ define i128 @sdiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd63, %r4; ; CHECK-NEXT: add.s64 %rd64, %rd63, 64; ; CHECK-NEXT: selp.b64 %rd65, %rd62, %rd64, %p7; -; CHECK-NEXT: mov.b64 %rd112, 0; +; CHECK-NEXT: mov.b64 %rd111, 0; ; CHECK-NEXT: sub.cc.s64 %rd67, %rd61, %rd65; -; CHECK-NEXT: subc.cc.s64 %rd68, %rd112, 0; -; CHECK-NEXT: setp.gt.u64 %p8, %rd67, 127; -; CHECK-NEXT: setp.eq.s64 %p9, %rd68, 0; -; CHECK-NEXT: and.pred %p10, %p9, %p8; -; CHECK-NEXT: setp.ne.s64 %p11, %rd68, 0; -; CHECK-NEXT: or.pred %p12, %p10, %p11; -; CHECK-NEXT: or.pred %p13, %p5, %p12; -; CHECK-NEXT: xor.b64 %rd69, %rd67, 127; -; CHECK-NEXT: or.b64 %rd70, %rd69, %rd68; -; CHECK-NEXT: setp.eq.s64 %p14, %rd70, 0; -; CHECK-NEXT: selp.b64 %rd121, 0, %rd2, %p13; -; CHECK-NEXT: selp.b64 %rd120, 0, %rd1, %p13; -; CHECK-NEXT: or.pred %p15, %p13, %p14; -; CHECK-NEXT: @%p15 bra $L__BB4_5; +; CHECK-NEXT: subc.cc.s64 %rd8, %rd111, 0; +; CHECK-NEXT: setp.ne.s64 %p8, %rd8, 0; +; CHECK-NEXT: and.pred %p10, %p8, %p8; +; CHECK-NEXT: setp.eq.s64 %p11, %rd8, 0; +; CHECK-NEXT: setp.gt.u64 %p12, %rd67, 127; +; CHECK-NEXT: and.pred %p13, %p11, %p12; +; CHECK-NEXT: or.pred %p14, %p13, %p10; +; CHECK-NEXT: or.pred %p15, %p5, %p14; +; CHECK-NEXT: xor.b64 %rd68, %rd67, 127; +; CHECK-NEXT: or.b64 %rd69, %rd68, %rd8; +; CHECK-NEXT: setp.eq.s64 %p16, %rd69, 0; +; CHECK-NEXT: selp.b64 %rd120, 0, %rd2, %p15; +; CHECK-NEXT: selp.b64 %rd119, 0, %rd1, %p15; +; CHECK-NEXT: or.pred %p17, %p15, %p16; +; CHECK-NEXT: @%p17 bra $L__BB4_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd114, %rd67, 1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd68, 0; -; CHECK-NEXT: or.b64 %rd73, %rd114, %rd115; -; CHECK-NEXT: setp.eq.s64 %p16, %rd73, 0; +; CHECK-NEXT: add.cc.s64 %rd113, %rd67, 1; +; CHECK-NEXT: addc.cc.s64 %rd114, %rd8, 0; +; CHECK-NEXT: or.b64 %rd72, %rd113, %rd114; +; CHECK-NEXT: setp.eq.s64 %p18, %rd72, 0; ; CHECK-NEXT: cvt.u32.u64 %r5, %rd67; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd74, %rd2, %r6; +; CHECK-NEXT: shl.b64 %rd73, %rd2, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd75, %rd1, %r7; -; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; +; CHECK-NEXT: shr.u64 %rd74, %rd1, %r7; +; CHECK-NEXT: or.b64 %rd75, %rd73, %rd74; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd77, %rd1, %r8; -; CHECK-NEXT: setp.gt.s32 %p17, %r6, 63; -; CHECK-NEXT: selp.b64 %rd119, %rd77, %rd76, %p17; -; CHECK-NEXT: shl.b64 %rd118, %rd1, %r6; -; CHECK-NEXT: mov.b64 %rd109, %rd112; -; CHECK-NEXT: @%p16 bra $L__BB4_4; +; CHECK-NEXT: shl.b64 %rd76, %rd1, %r8; +; CHECK-NEXT: setp.gt.s32 %p19, %r6, 63; +; CHECK-NEXT: selp.b64 %rd118, %rd76, %rd75, %p19; +; CHECK-NEXT: shl.b64 %rd117, %rd1, %r6; +; CHECK-NEXT: mov.b64 %rd108, %rd111; +; CHECK-NEXT: @%p18 bra $L__BB4_4; ; CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd114; -; CHECK-NEXT: shr.u64 %rd80, %rd1, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd113; +; CHECK-NEXT: shr.u64 %rd79, %rd1, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd81, %rd2, %r10; -; 
CHECK-NEXT: or.b64 %rd82, %rd80, %rd81; +; CHECK-NEXT: shl.b64 %rd80, %rd2, %r10; +; CHECK-NEXT: or.b64 %rd81, %rd79, %rd80; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd83, %rd2, %r11; -; CHECK-NEXT: setp.gt.s32 %p18, %r9, 63; -; CHECK-NEXT: selp.b64 %rd116, %rd83, %rd82, %p18; -; CHECK-NEXT: shr.u64 %rd117, %rd2, %r9; +; CHECK-NEXT: shr.u64 %rd82, %rd2, %r11; +; CHECK-NEXT: setp.gt.s32 %p20, %r9, 63; +; CHECK-NEXT: selp.b64 %rd115, %rd82, %rd81, %p20; +; CHECK-NEXT: shr.u64 %rd116, %rd2, %r9; ; CHECK-NEXT: add.cc.s64 %rd35, %rd3, -1; ; CHECK-NEXT: addc.cc.s64 %rd36, %rd4, -1; -; CHECK-NEXT: mov.b64 %rd109, 0; -; CHECK-NEXT: mov.b64 %rd112, %rd109; +; CHECK-NEXT: mov.b64 %rd108, 0; +; CHECK-NEXT: mov.b64 %rd111, %rd108; ; CHECK-NEXT: $L__BB4_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd84, %rd116, 63; -; CHECK-NEXT: shl.b64 %rd85, %rd117, 1; -; CHECK-NEXT: or.b64 %rd86, %rd85, %rd84; -; CHECK-NEXT: shl.b64 %rd87, %rd116, 1; -; CHECK-NEXT: shr.u64 %rd88, %rd119, 63; -; CHECK-NEXT: or.b64 %rd89, %rd87, %rd88; -; CHECK-NEXT: shr.u64 %rd90, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd91, %rd119, 1; -; CHECK-NEXT: or.b64 %rd92, %rd91, %rd90; -; CHECK-NEXT: shl.b64 %rd93, %rd118, 1; -; CHECK-NEXT: or.b64 %rd118, %rd112, %rd93; -; CHECK-NEXT: or.b64 %rd119, %rd109, %rd92; -; CHECK-NEXT: sub.cc.s64 %rd94, %rd35, %rd89; -; CHECK-NEXT: subc.cc.s64 %rd95, %rd36, %rd86; -; CHECK-NEXT: shr.s64 %rd96, %rd95, 63; -; CHECK-NEXT: and.b64 %rd112, %rd96, 1; -; CHECK-NEXT: and.b64 %rd97, %rd96, %rd3; -; CHECK-NEXT: and.b64 %rd98, %rd96, %rd4; -; CHECK-NEXT: sub.cc.s64 %rd116, %rd89, %rd97; -; CHECK-NEXT: subc.cc.s64 %rd117, %rd86, %rd98; -; CHECK-NEXT: add.cc.s64 %rd114, %rd114, -1; -; CHECK-NEXT: addc.cc.s64 %rd115, %rd115, -1; -; CHECK-NEXT: or.b64 %rd99, %rd114, %rd115; -; CHECK-NEXT: setp.eq.s64 %p19, %rd99, 0; -; CHECK-NEXT: @%p19 bra $L__BB4_4; +; CHECK-NEXT: shr.u64 %rd83, %rd115, 63; +; CHECK-NEXT: shl.b64 %rd84, %rd116, 1; +; CHECK-NEXT: or.b64 %rd85, %rd84, %rd83; +; CHECK-NEXT: shl.b64 %rd86, %rd115, 1; +; CHECK-NEXT: shr.u64 %rd87, %rd118, 63; +; CHECK-NEXT: or.b64 %rd88, %rd86, %rd87; +; CHECK-NEXT: shr.u64 %rd89, %rd117, 63; +; CHECK-NEXT: shl.b64 %rd90, %rd118, 1; +; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; +; CHECK-NEXT: shl.b64 %rd92, %rd117, 1; +; CHECK-NEXT: or.b64 %rd117, %rd111, %rd92; +; CHECK-NEXT: or.b64 %rd118, %rd108, %rd91; +; CHECK-NEXT: sub.cc.s64 %rd93, %rd35, %rd88; +; CHECK-NEXT: subc.cc.s64 %rd94, %rd36, %rd85; +; CHECK-NEXT: shr.s64 %rd95, %rd94, 63; +; CHECK-NEXT: and.b64 %rd111, %rd95, 1; +; CHECK-NEXT: and.b64 %rd96, %rd95, %rd3; +; CHECK-NEXT: and.b64 %rd97, %rd95, %rd4; +; CHECK-NEXT: sub.cc.s64 %rd115, %rd88, %rd96; +; CHECK-NEXT: subc.cc.s64 %rd116, %rd85, %rd97; +; CHECK-NEXT: add.cc.s64 %rd113, %rd113, -1; +; CHECK-NEXT: addc.cc.s64 %rd114, %rd114, -1; +; CHECK-NEXT: or.b64 %rd98, %rd113, %rd114; +; CHECK-NEXT: setp.eq.s64 %p21, %rd98, 0; +; CHECK-NEXT: @%p21 bra $L__BB4_4; ; CHECK-NEXT: bra.uni $L__BB4_2; ; CHECK-NEXT: $L__BB4_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd100, %rd118, 63; -; CHECK-NEXT: shl.b64 %rd101, %rd119, 1; -; CHECK-NEXT: or.b64 %rd102, %rd101, %rd100; -; CHECK-NEXT: shl.b64 %rd103, %rd118, 1; -; CHECK-NEXT: or.b64 %rd120, %rd112, %rd103; -; CHECK-NEXT: or.b64 %rd121, %rd109, %rd102; +; CHECK-NEXT: shr.u64 %rd99, %rd117, 63; +; CHECK-NEXT: shl.b64 %rd100, %rd118, 1; +; CHECK-NEXT: or.b64 %rd101, %rd100, %rd99; +; CHECK-NEXT: shl.b64 %rd102, %rd117, 1; +; CHECK-NEXT: or.b64 
%rd119, %rd111, %rd102; +; CHECK-NEXT: or.b64 %rd120, %rd108, %rd101; ; CHECK-NEXT: $L__BB4_5: // %udiv-end +; CHECK-NEXT: xor.b64 %rd103, %rd119, %rd5; ; CHECK-NEXT: xor.b64 %rd104, %rd120, %rd5; -; CHECK-NEXT: xor.b64 %rd105, %rd121, %rd5; -; CHECK-NEXT: sub.cc.s64 %rd106, %rd104, %rd5; -; CHECK-NEXT: subc.cc.s64 %rd107, %rd105, %rd5; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd106, %rd107}; +; CHECK-NEXT: sub.cc.s64 %rd105, %rd103, %rd5; +; CHECK-NEXT: subc.cc.s64 %rd106, %rd104, %rd5; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; ; CHECK-NEXT: ret; %div = sdiv i128 %lhs, %rhs ret i128 %div @@ -444,7 +446,7 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK: { ; CHECK-NEXT: .reg .pred %p<18>; ; CHECK-NEXT: .reg .b32 %r<12>; -; CHECK-NEXT: .reg .b64 %rd<107>; +; CHECK-NEXT: .reg .b64 %rd<105>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: // %_udiv-special-cases ; CHECK-NEXT: ld.param.v2.b64 {%rd41, %rd42}, [udiv_i128_param_0]; @@ -468,92 +470,92 @@ define i128 @udiv_i128(i128 %lhs, i128 %rhs) { ; CHECK-NEXT: cvt.u64.u32 %rd52, %r4; ; CHECK-NEXT: add.s64 %rd53, %rd52, 64; ; CHECK-NEXT: selp.b64 %rd54, %rd51, %rd53, %p5; -; CHECK-NEXT: mov.b64 %rd97, 0; -; CHECK-NEXT: sub.cc.s64 %rd56, %rd50, %rd54; -; CHECK-NEXT: subc.cc.s64 %rd57, %rd97, 0; -; CHECK-NEXT: setp.gt.u64 %p6, %rd56, 127; -; CHECK-NEXT: setp.eq.s64 %p7, %rd57, 0; +; CHECK-NEXT: mov.b64 %rd95, 0; +; CHECK-NEXT: sub.cc.s64 %rd5, %rd50, %rd54; +; CHECK-NEXT: subc.cc.s64 %rd6, %rd95, 0; +; CHECK-NEXT: setp.gt.u64 %p6, %rd5, 127; +; CHECK-NEXT: setp.eq.s64 %p7, %rd6, 0; ; CHECK-NEXT: and.pred %p8, %p7, %p6; -; CHECK-NEXT: setp.ne.s64 %p9, %rd57, 0; +; CHECK-NEXT: setp.ne.s64 %p9, %rd6, 0; ; CHECK-NEXT: or.pred %p10, %p8, %p9; ; CHECK-NEXT: or.pred %p11, %p3, %p10; -; CHECK-NEXT: xor.b64 %rd58, %rd56, 127; -; CHECK-NEXT: or.b64 %rd59, %rd58, %rd57; -; CHECK-NEXT: setp.eq.s64 %p12, %rd59, 0; -; CHECK-NEXT: selp.b64 %rd106, 0, %rd42, %p11; -; CHECK-NEXT: selp.b64 %rd105, 0, %rd41, %p11; +; CHECK-NEXT: xor.b64 %rd56, %rd5, 127; +; CHECK-NEXT: or.b64 %rd57, %rd56, %rd6; +; CHECK-NEXT: setp.eq.s64 %p12, %rd57, 0; +; CHECK-NEXT: selp.b64 %rd104, 0, %rd42, %p11; +; CHECK-NEXT: selp.b64 %rd103, 0, %rd41, %p11; ; CHECK-NEXT: or.pred %p13, %p11, %p12; ; CHECK-NEXT: @%p13 bra $L__BB5_5; ; CHECK-NEXT: // %bb.3: // %udiv-bb1 -; CHECK-NEXT: add.cc.s64 %rd99, %rd56, 1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd57, 0; -; CHECK-NEXT: or.b64 %rd62, %rd99, %rd100; -; CHECK-NEXT: setp.eq.s64 %p14, %rd62, 0; -; CHECK-NEXT: cvt.u32.u64 %r5, %rd56; +; CHECK-NEXT: add.cc.s64 %rd97, %rd5, 1; +; CHECK-NEXT: addc.cc.s64 %rd98, %rd6, 0; +; CHECK-NEXT: or.b64 %rd60, %rd97, %rd98; +; CHECK-NEXT: setp.eq.s64 %p14, %rd60, 0; +; CHECK-NEXT: cvt.u32.u64 %r5, %rd5; ; CHECK-NEXT: sub.s32 %r6, 127, %r5; -; CHECK-NEXT: shl.b64 %rd63, %rd42, %r6; +; CHECK-NEXT: shl.b64 %rd61, %rd42, %r6; ; CHECK-NEXT: sub.s32 %r7, 64, %r6; -; CHECK-NEXT: shr.u64 %rd64, %rd41, %r7; -; CHECK-NEXT: or.b64 %rd65, %rd63, %rd64; +; CHECK-NEXT: shr.u64 %rd62, %rd41, %r7; +; CHECK-NEXT: or.b64 %rd63, %rd61, %rd62; ; CHECK-NEXT: sub.s32 %r8, 63, %r5; -; CHECK-NEXT: shl.b64 %rd66, %rd41, %r8; +; CHECK-NEXT: shl.b64 %rd64, %rd41, %r8; ; CHECK-NEXT: setp.gt.s32 %p15, %r6, 63; -; CHECK-NEXT: selp.b64 %rd104, %rd66, %rd65, %p15; -; CHECK-NEXT: shl.b64 %rd103, %rd41, %r6; -; CHECK-NEXT: mov.b64 %rd94, %rd97; +; CHECK-NEXT: selp.b64 %rd102, %rd64, %rd63, %p15; +; CHECK-NEXT: shl.b64 %rd101, %rd41, %r6; +; CHECK-NEXT: mov.b64 %rd92, %rd95; ; CHECK-NEXT: @%p14 bra $L__BB5_4; ; 
CHECK-NEXT: // %bb.1: // %udiv-preheader -; CHECK-NEXT: cvt.u32.u64 %r9, %rd99; -; CHECK-NEXT: shr.u64 %rd69, %rd41, %r9; +; CHECK-NEXT: cvt.u32.u64 %r9, %rd97; +; CHECK-NEXT: shr.u64 %rd67, %rd41, %r9; ; CHECK-NEXT: sub.s32 %r10, 64, %r9; -; CHECK-NEXT: shl.b64 %rd70, %rd42, %r10; -; CHECK-NEXT: or.b64 %rd71, %rd69, %rd70; +; CHECK-NEXT: shl.b64 %rd68, %rd42, %r10; +; CHECK-NEXT: or.b64 %rd69, %rd67, %rd68; ; CHECK-NEXT: add.s32 %r11, %r9, -64; -; CHECK-NEXT: shr.u64 %rd72, %rd42, %r11; +; CHECK-NEXT: shr.u64 %rd70, %rd42, %r11; ; CHECK-NEXT: setp.gt.s32 %p16, %r9, 63; -; CHECK-NEXT: selp.b64 %rd101, %rd72, %rd71, %p16; -; CHECK-NEXT: shr.u64 %rd102, %rd42, %r9; +; CHECK-NEXT: selp.b64 %rd99, %rd70, %rd69, %p16; +; CHECK-NEXT: shr.u64 %rd100, %rd42, %r9; ; CHECK-NEXT: add.cc.s64 %rd33, %rd43, -1; ; CHECK-NEXT: addc.cc.s64 %rd34, %rd44, -1; -; CHECK-NEXT: mov.b64 %rd94, 0; -; CHECK-NEXT: mov.b64 %rd97, %rd94; +; CHECK-NEXT: mov.b64 %rd92, 0; +; CHECK-NEXT: mov.b64 %rd95, %rd92; ; CHECK-NEXT: $L__BB5_2: // %udiv-do-while ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: shr.u64 %rd73, %rd101, 63; -; CHECK-NEXT: shl.b64 %rd74, %rd102, 1; -; CHECK-NEXT: or.b64 %rd75, %rd74, %rd73; -; CHECK-NEXT: shl.b64 %rd76, %rd101, 1; -; CHECK-NEXT: shr.u64 %rd77, %rd104, 63; -; CHECK-NEXT: or.b64 %rd78, %rd76, %rd77; -; CHECK-NEXT: shr.u64 %rd79, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd80, %rd104, 1; -; CHECK-NEXT: or.b64 %rd81, %rd80, %rd79; -; CHECK-NEXT: shl.b64 %rd82, %rd103, 1; -; CHECK-NEXT: or.b64 %rd103, %rd97, %rd82; -; CHECK-NEXT: or.b64 %rd104, %rd94, %rd81; -; CHECK-NEXT: sub.cc.s64 %rd83, %rd33, %rd78; -; CHECK-NEXT: subc.cc.s64 %rd84, %rd34, %rd75; -; CHECK-NEXT: shr.s64 %rd85, %rd84, 63; -; CHECK-NEXT: and.b64 %rd97, %rd85, 1; -; CHECK-NEXT: and.b64 %rd86, %rd85, %rd43; -; CHECK-NEXT: and.b64 %rd87, %rd85, %rd44; -; CHECK-NEXT: sub.cc.s64 %rd101, %rd78, %rd86; -; CHECK-NEXT: subc.cc.s64 %rd102, %rd75, %rd87; -; CHECK-NEXT: add.cc.s64 %rd99, %rd99, -1; -; CHECK-NEXT: addc.cc.s64 %rd100, %rd100, -1; -; CHECK-NEXT: or.b64 %rd88, %rd99, %rd100; -; CHECK-NEXT: setp.eq.s64 %p17, %rd88, 0; +; CHECK-NEXT: shr.u64 %rd71, %rd99, 63; +; CHECK-NEXT: shl.b64 %rd72, %rd100, 1; +; CHECK-NEXT: or.b64 %rd73, %rd72, %rd71; +; CHECK-NEXT: shl.b64 %rd74, %rd99, 1; +; CHECK-NEXT: shr.u64 %rd75, %rd102, 63; +; CHECK-NEXT: or.b64 %rd76, %rd74, %rd75; +; CHECK-NEXT: shr.u64 %rd77, %rd101, 63; +; CHECK-NEXT: shl.b64 %rd78, %rd102, 1; +; CHECK-NEXT: or.b64 %rd79, %rd78, %rd77; +; CHECK-NEXT: shl.b64 %rd80, %rd101, 1; +; CHECK-NEXT: or.b64 %rd101, %rd95, %rd80; +; CHECK-NEXT: or.b64 %rd102, %rd92, %rd79; +; CHECK-NEXT: sub.cc.s64 %rd81, %rd33, %rd76; +; CHECK-NEXT: subc.cc.s64 %rd82, %rd34, %rd73; +; CHECK-NEXT: shr.s64 %rd83, %rd82, 63; +; CHECK-NEXT: and.b64 %rd95, %rd83, 1; +; CHECK-NEXT: and.b64 %rd84, %rd83, %rd43; +; CHECK-NEXT: and.b64 %rd85, %rd83, %rd44; +; CHECK-NEXT: sub.cc.s64 %rd99, %rd76, %rd84; +; CHECK-NEXT: subc.cc.s64 %rd100, %rd73, %rd85; +; CHECK-NEXT: add.cc.s64 %rd97, %rd97, -1; +; CHECK-NEXT: addc.cc.s64 %rd98, %rd98, -1; +; CHECK-NEXT: or.b64 %rd86, %rd97, %rd98; +; CHECK-NEXT: setp.eq.s64 %p17, %rd86, 0; ; CHECK-NEXT: @%p17 bra $L__BB5_4; ; CHECK-NEXT: bra.uni $L__BB5_2; ; CHECK-NEXT: $L__BB5_4: // %udiv-loop-exit -; CHECK-NEXT: shr.u64 %rd89, %rd103, 63; -; CHECK-NEXT: shl.b64 %rd90, %rd104, 1; -; CHECK-NEXT: or.b64 %rd91, %rd90, %rd89; -; CHECK-NEXT: shl.b64 %rd92, %rd103, 1; -; CHECK-NEXT: or.b64 %rd105, %rd97, %rd92; -; CHECK-NEXT: or.b64 %rd106, %rd94, %rd91; +; CHECK-NEXT: 
shr.u64 %rd87, %rd101, 63; +; CHECK-NEXT: shl.b64 %rd88, %rd102, 1; +; CHECK-NEXT: or.b64 %rd89, %rd88, %rd87; +; CHECK-NEXT: shl.b64 %rd90, %rd101, 1; +; CHECK-NEXT: or.b64 %rd103, %rd95, %rd90; +; CHECK-NEXT: or.b64 %rd104, %rd92, %rd89; ; CHECK-NEXT: $L__BB5_5: // %udiv-end -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd105, %rd106}; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd103, %rd104}; ; CHECK-NEXT: ret; %div = udiv i128 %lhs, %rhs ret i128 %div diff --git a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll index b540948b20f75..821cfd00dcd07 100644 --- a/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll +++ b/llvm/test/CodeGen/PowerPC/ppc64-P9-vabsd.ll @@ -764,13 +764,8 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; ; CHECK-PWR7-LABEL: sub_absv_8_ext: ; CHECK-PWR7: # %bb.0: # %entry -; CHECK-PWR7-NEXT: stdu r1, -512(r1) -; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 512 -; CHECK-PWR7-NEXT: .cfi_offset r14, -144 -; CHECK-PWR7-NEXT: .cfi_offset r15, -136 -; CHECK-PWR7-NEXT: .cfi_offset r16, -128 -; CHECK-PWR7-NEXT: .cfi_offset r17, -120 -; CHECK-PWR7-NEXT: .cfi_offset r18, -112 +; CHECK-PWR7-NEXT: stdu r1, -448(r1) +; CHECK-PWR7-NEXT: .cfi_def_cfa_offset 448 ; CHECK-PWR7-NEXT: .cfi_offset r19, -104 ; CHECK-PWR7-NEXT: .cfi_offset r20, -96 ; CHECK-PWR7-NEXT: .cfi_offset r21, -88 @@ -783,244 +778,258 @@ define <16 x i8> @sub_absv_8_ext(<16 x i8> %a, <16 x i8> %b) local_unnamed_addr ; CHECK-PWR7-NEXT: .cfi_offset r28, -32 ; CHECK-PWR7-NEXT: .cfi_offset r29, -24 ; CHECK-PWR7-NEXT: .cfi_offset r30, -16 -; CHECK-PWR7-NEXT: .cfi_offset r31, -8 -; CHECK-PWR7-NEXT: .cfi_offset r2, -152 -; CHECK-PWR7-NEXT: addi r3, r1, 320 -; CHECK-PWR7-NEXT: std r14, 368(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r15, 376(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r16, 384(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r17, 392(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r18, 400(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r19, 408(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r20, 416(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r21, 424(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r22, 432(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r23, 440(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r24, 448(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r25, 456(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r26, 464(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r27, 472(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r28, 480(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r29, 488(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r30, 496(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r31, 504(r1) # 8-byte Folded Spill -; CHECK-PWR7-NEXT: std r2, 360(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: addi r3, r1, 304 +; CHECK-PWR7-NEXT: std r19, 344(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r20, 352(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r21, 360(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r22, 368(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r23, 376(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r24, 384(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r25, 392(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r26, 400(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r27, 408(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r28, 416(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std r29, 424(r1) # 8-byte Folded Spill +; CHECK-PWR7-NEXT: std 
r30, 432(r1) # 8-byte Folded Spill ; CHECK-PWR7-NEXT: stxvw4x v2, 0, r3 -; CHECK-PWR7-NEXT: lbz r3, 320(r1) -; CHECK-PWR7-NEXT: addi r4, r1, 336 -; CHECK-PWR7-NEXT: stw r3, 60(r1) # 4-byte Folded Spill -; CHECK-PWR7-NEXT: stxvw4x v3, 0, r4 -; CHECK-PWR7-NEXT: lbz r15, 334(r1) -; CHECK-PWR7-NEXT: lbz r14, 350(r1) -; CHECK-PWR7-NEXT: lbz r31, 335(r1) -; CHECK-PWR7-NEXT: lbz r2, 351(r1) -; CHECK-PWR7-NEXT: sub r15, r15, r14 -; CHECK-PWR7-NEXT: sub r14, r31, r2 -; CHECK-PWR7-NEXT: srawi r2, r14, 31 -; CHECK-PWR7-NEXT: xor r14, r14, r2 -; CHECK-PWR7-NEXT: lbz r3, 333(r1) -; CHECK-PWR7-NEXT: lbz r19, 331(r1) -; CHECK-PWR7-NEXT: lbz r18, 347(r1) -; CHECK-PWR7-NEXT: sub r19, r19, r18 -; CHECK-PWR7-NEXT: lbz r17, 332(r1) -; CHECK-PWR7-NEXT: lbz r16, 348(r1) -; CHECK-PWR7-NEXT: sub r17, r17, r16 -; CHECK-PWR7-NEXT: lbz r23, 329(r1) -; CHECK-PWR7-NEXT: sub r14, r14, r2 -; CHECK-PWR7-NEXT: lbz r2, 349(r1) -; CHECK-PWR7-NEXT: lbz r22, 345(r1) -; CHECK-PWR7-NEXT: lbz r4, 336(r1) -; CHECK-PWR7-NEXT: lbz r5, 321(r1) -; CHECK-PWR7-NEXT: lbz r6, 337(r1) -; CHECK-PWR7-NEXT: lbz r7, 322(r1) -; CHECK-PWR7-NEXT: lbz r8, 338(r1) -; CHECK-PWR7-NEXT: lbz r9, 323(r1) -; CHECK-PWR7-NEXT: lbz r10, 339(r1) -; CHECK-PWR7-NEXT: lbz r11, 324(r1) -; CHECK-PWR7-NEXT: lbz r12, 340(r1) -; CHECK-PWR7-NEXT: lbz r0, 325(r1) -; CHECK-PWR7-NEXT: lbz r30, 341(r1) -; CHECK-PWR7-NEXT: lbz r29, 326(r1) -; CHECK-PWR7-NEXT: lbz r28, 342(r1) -; CHECK-PWR7-NEXT: lbz r27, 327(r1) -; CHECK-PWR7-NEXT: lbz r26, 343(r1) -; CHECK-PWR7-NEXT: sub r3, r3, r2 -; CHECK-PWR7-NEXT: lbz r25, 328(r1) -; CHECK-PWR7-NEXT: lbz r24, 344(r1) -; CHECK-PWR7-NEXT: lbz r21, 330(r1) -; CHECK-PWR7-NEXT: lbz r20, 346(r1) +; CHECK-PWR7-NEXT: addi r3, r1, 320 +; CHECK-PWR7-NEXT: lbz r7, 304(r1) +; CHECK-PWR7-NEXT: stxvw4x v3, 0, r3 +; CHECK-PWR7-NEXT: lbz r8, 320(r1) +; CHECK-PWR7-NEXT: lbz r9, 305(r1) +; CHECK-PWR7-NEXT: lbz r10, 321(r1) +; CHECK-PWR7-NEXT: lbz r26, 325(r1) +; CHECK-PWR7-NEXT: clrlwi r7, r7, 24 +; CHECK-PWR7-NEXT: clrlwi r8, r8, 24 +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: clrlwi r10, r10, 24 +; CHECK-PWR7-NEXT: lbz r11, 306(r1) +; CHECK-PWR7-NEXT: lbz r12, 322(r1) +; CHECK-PWR7-NEXT: lbz r23, 314(r1) +; CHECK-PWR7-NEXT: clrlwi r22, r26, 24 +; CHECK-PWR7-NEXT: lbz r26, 330(r1) +; CHECK-PWR7-NEXT: sub r8, r7, r8 +; CHECK-PWR7-NEXT: lbz r7, 315(r1) +; CHECK-PWR7-NEXT: sub r20, r9, r10 +; CHECK-PWR7-NEXT: lbz r9, 331(r1) +; CHECK-PWR7-NEXT: lbz r0, 307(r1) +; CHECK-PWR7-NEXT: lbz r30, 323(r1) +; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR7-NEXT: clrlwi r12, r12, 24 +; CHECK-PWR7-NEXT: clrlwi r23, r23, 24 +; CHECK-PWR7-NEXT: clrlwi r21, r26, 24 +; CHECK-PWR7-NEXT: clrlwi r7, r7, 24 +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: clrlwi r0, r0, 24 +; CHECK-PWR7-NEXT: clrlwi r30, r30, 24 +; CHECK-PWR7-NEXT: lbz r29, 308(r1) +; CHECK-PWR7-NEXT: lbz r28, 324(r1) +; CHECK-PWR7-NEXT: lbz r27, 309(r1) +; CHECK-PWR7-NEXT: lbz r25, 310(r1) +; CHECK-PWR7-NEXT: lbz r24, 326(r1) +; CHECK-PWR7-NEXT: sub r19, r11, r12 +; CHECK-PWR7-NEXT: sub r11, r23, r21 +; CHECK-PWR7-NEXT: sub r9, r7, r9 +; CHECK-PWR7-NEXT: sub r26, r0, r30 +; CHECK-PWR7-NEXT: srawi r12, r11, 31 +; CHECK-PWR7-NEXT: srawi r0, r9, 31 +; CHECK-PWR7-NEXT: lbz r3, 312(r1) +; CHECK-PWR7-NEXT: clrlwi r29, r29, 24 +; CHECK-PWR7-NEXT: clrlwi r28, r28, 24 +; CHECK-PWR7-NEXT: clrlwi r27, r27, 24 +; CHECK-PWR7-NEXT: clrlwi r25, r25, 24 +; CHECK-PWR7-NEXT: clrlwi r24, r24, 24 +; CHECK-PWR7-NEXT: xor r11, r11, r12 +; CHECK-PWR7-NEXT: xor r9, r9, r0 +; CHECK-PWR7-NEXT: sub r28, 
r29, r28 +; CHECK-PWR7-NEXT: sub r30, r27, r22 +; CHECK-PWR7-NEXT: sub r29, r25, r24 +; CHECK-PWR7-NEXT: sub r27, r11, r12 +; CHECK-PWR7-NEXT: sub r24, r9, r0 +; CHECK-PWR7-NEXT: lbz r9, 316(r1) +; CHECK-PWR7-NEXT: lbz r11, 332(r1) +; CHECK-PWR7-NEXT: lbz r4, 328(r1) +; CHECK-PWR7-NEXT: lbz r5, 311(r1) +; CHECK-PWR7-NEXT: lbz r6, 327(r1) +; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR7-NEXT: clrlwi r3, r3, 24 +; CHECK-PWR7-NEXT: clrlwi r4, r4, 24 +; CHECK-PWR7-NEXT: clrlwi r5, r5, 24 +; CHECK-PWR7-NEXT: clrlwi r6, r6, 24 +; CHECK-PWR7-NEXT: sub r3, r3, r4 ; CHECK-PWR7-NEXT: sub r5, r5, r6 -; CHECK-PWR7-NEXT: srawi r18, r3, 31 -; CHECK-PWR7-NEXT: sub r7, r7, r8 -; CHECK-PWR7-NEXT: sub r9, r9, r10 -; CHECK-PWR7-NEXT: sub r11, r11, r12 -; CHECK-PWR7-NEXT: sub r0, r0, r30 -; CHECK-PWR7-NEXT: sub r29, r29, r28 -; CHECK-PWR7-NEXT: sub r27, r27, r26 -; CHECK-PWR7-NEXT: sub r25, r25, r24 -; CHECK-PWR7-NEXT: srawi r31, r15, 31 -; CHECK-PWR7-NEXT: ld r2, 360(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: xor r3, r3, r18 +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: srawi r4, r3, 31 ; CHECK-PWR7-NEXT: srawi r6, r5, 31 -; CHECK-PWR7-NEXT: srawi r8, r7, 31 -; CHECK-PWR7-NEXT: srawi r10, r9, 31 -; CHECK-PWR7-NEXT: srawi r12, r11, 31 -; CHECK-PWR7-NEXT: srawi r30, r0, 31 -; CHECK-PWR7-NEXT: sub r3, r3, r18 -; CHECK-PWR7-NEXT: srawi r18, r19, 31 -; CHECK-PWR7-NEXT: srawi r28, r29, 31 -; CHECK-PWR7-NEXT: ld r16, 384(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: srawi r26, r27, 31 -; CHECK-PWR7-NEXT: srawi r24, r25, 31 -; CHECK-PWR7-NEXT: xor r19, r19, r18 -; CHECK-PWR7-NEXT: xor r15, r15, r31 +; CHECK-PWR7-NEXT: xor r3, r3, r4 +; CHECK-PWR7-NEXT: sldi r27, r27, 56 ; CHECK-PWR7-NEXT: xor r5, r5, r6 -; CHECK-PWR7-NEXT: std r3, 272(r1) -; CHECK-PWR7-NEXT: std r3, 280(r1) -; CHECK-PWR7-NEXT: srawi r3, r17, 31 -; CHECK-PWR7-NEXT: sub r19, r19, r18 -; CHECK-PWR7-NEXT: xor r7, r7, r8 -; CHECK-PWR7-NEXT: sub r15, r15, r31 -; CHECK-PWR7-NEXT: xor r17, r17, r3 -; CHECK-PWR7-NEXT: xor r9, r9, r10 -; CHECK-PWR7-NEXT: xor r11, r11, r12 -; CHECK-PWR7-NEXT: xor r0, r0, r30 -; CHECK-PWR7-NEXT: xor r29, r29, r28 -; CHECK-PWR7-NEXT: xor r27, r27, r26 -; CHECK-PWR7-NEXT: sub r3, r17, r3 -; CHECK-PWR7-NEXT: xor r25, r25, r24 -; CHECK-PWR7-NEXT: sub r25, r25, r24 -; CHECK-PWR7-NEXT: sub r27, r27, r26 -; CHECK-PWR7-NEXT: sub r29, r29, r28 +; CHECK-PWR7-NEXT: sub r9, r9, r11 +; CHECK-PWR7-NEXT: sub r3, r3, r4 +; CHECK-PWR7-NEXT: sldi r24, r24, 56 ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: sub r0, r0, r30 -; CHECK-PWR7-NEXT: sub r11, r11, r12 -; CHECK-PWR7-NEXT: sub r9, r9, r10 -; CHECK-PWR7-NEXT: sub r7, r7, r8 -; CHECK-PWR7-NEXT: sub r5, r5, r6 -; CHECK-PWR7-NEXT: sldi r14, r14, 56 -; CHECK-PWR7-NEXT: sldi r15, r15, 56 -; CHECK-PWR7-NEXT: ld r31, 504(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r3, 256(r1) -; CHECK-PWR7-NEXT: std r3, 264(r1) -; CHECK-PWR7-NEXT: sldi r3, r19, 56 +; CHECK-PWR7-NEXT: srawi r11, r9, 31 +; CHECK-PWR7-NEXT: std r27, 208(r1) +; CHECK-PWR7-NEXT: sub r4, r5, r6 +; CHECK-PWR7-NEXT: std r27, 216(r1) +; CHECK-PWR7-NEXT: srawi r27, r29, 31 +; CHECK-PWR7-NEXT: lbz r10, 313(r1) +; CHECK-PWR7-NEXT: xor r9, r9, r11 +; CHECK-PWR7-NEXT: std r24, 224(r1) +; CHECK-PWR7-NEXT: lbz r22, 329(r1) +; CHECK-PWR7-NEXT: std r24, 232(r1) +; CHECK-PWR7-NEXT: srawi r24, r30, 31 +; CHECK-PWR7-NEXT: ld r21, 360(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: sub r23, r9, r11 +; CHECK-PWR7-NEXT: lbz r9, 317(r1) +; CHECK-PWR7-NEXT: lbz r11, 333(r1) +; 
CHECK-PWR7-NEXT: xor r29, r29, r27 +; CHECK-PWR7-NEXT: std r3, 176(r1) +; CHECK-PWR7-NEXT: std r3, 184(r1) +; CHECK-PWR7-NEXT: sldi r3, r4, 56 +; CHECK-PWR7-NEXT: sldi r23, r23, 56 +; CHECK-PWR7-NEXT: xor r30, r30, r24 +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR7-NEXT: sub r4, r30, r24 +; CHECK-PWR7-NEXT: ld r30, 432(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r3, 160(r1) +; CHECK-PWR7-NEXT: std r3, 168(r1) +; CHECK-PWR7-NEXT: sub r9, r9, r11 +; CHECK-PWR7-NEXT: sub r3, r29, r27 +; CHECK-PWR7-NEXT: std r23, 240(r1) +; CHECK-PWR7-NEXT: ld r29, 424(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: srawi r11, r9, 31 +; CHECK-PWR7-NEXT: std r23, 248(r1) +; CHECK-PWR7-NEXT: ld r27, 408(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: srawi r23, r28, 31 +; CHECK-PWR7-NEXT: sldi r3, r3, 56 +; CHECK-PWR7-NEXT: xor r28, r28, r23 +; CHECK-PWR7-NEXT: xor r9, r9, r11 +; CHECK-PWR7-NEXT: std r3, 144(r1) +; CHECK-PWR7-NEXT: ld r24, 384(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r3, 152(r1) +; CHECK-PWR7-NEXT: sldi r3, r4, 56 +; CHECK-PWR7-NEXT: sub r25, r9, r11 +; CHECK-PWR7-NEXT: lbz r9, 318(r1) +; CHECK-PWR7-NEXT: lbz r11, 334(r1) +; CHECK-PWR7-NEXT: std r3, 128(r1) ; CHECK-PWR7-NEXT: sldi r25, r25, 56 -; CHECK-PWR7-NEXT: sldi r27, r27, 56 -; CHECK-PWR7-NEXT: std r3, 240(r1) -; CHECK-PWR7-NEXT: std r3, 248(r1) -; CHECK-PWR7-NEXT: sub r3, r23, r22 -; CHECK-PWR7-NEXT: srawi r23, r3, 31 -; CHECK-PWR7-NEXT: sub r22, r21, r20 -; CHECK-PWR7-NEXT: srawi r21, r22, 31 -; CHECK-PWR7-NEXT: sldi r29, r29, 56 -; CHECK-PWR7-NEXT: sldi r0, r0, 56 -; CHECK-PWR7-NEXT: sldi r11, r11, 56 -; CHECK-PWR7-NEXT: xor r3, r3, r23 -; CHECK-PWR7-NEXT: xor r22, r22, r21 -; CHECK-PWR7-NEXT: sldi r9, r9, 56 -; CHECK-PWR7-NEXT: sldi r7, r7, 56 -; CHECK-PWR7-NEXT: sldi r5, r5, 56 -; CHECK-PWR7-NEXT: ld r30, 496(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r28, 480(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: sub r3, r3, r23 -; CHECK-PWR7-NEXT: sub r22, r22, r21 -; CHECK-PWR7-NEXT: std r14, 304(r1) -; CHECK-PWR7-NEXT: ld r26, 464(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r3, 136(r1) +; CHECK-PWR7-NEXT: sub r3, r28, r23 ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: sldi r22, r22, 56 -; CHECK-PWR7-NEXT: ld r24, 448(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r23, 440(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r14, 312(r1) -; CHECK-PWR7-NEXT: std r15, 288(r1) -; CHECK-PWR7-NEXT: std r3, 208(r1) -; CHECK-PWR7-NEXT: std r3, 216(r1) -; CHECK-PWR7-NEXT: lwz r3, 60(r1) # 4-byte Folded Reload -; CHECK-PWR7-NEXT: std r15, 296(r1) -; CHECK-PWR7-NEXT: ld r21, 424(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r20, 416(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r22, 224(r1) -; CHECK-PWR7-NEXT: std r22, 232(r1) -; CHECK-PWR7-NEXT: sub r4, r3, r4 -; CHECK-PWR7-NEXT: std r25, 192(r1) -; CHECK-PWR7-NEXT: ld r22, 432(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: ld r19, 408(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: srawi r3, r4, 31 -; CHECK-PWR7-NEXT: std r25, 200(r1) -; CHECK-PWR7-NEXT: ld r25, 456(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r27, 176(r1) -; CHECK-PWR7-NEXT: std r27, 184(r1) -; CHECK-PWR7-NEXT: xor r4, r4, r3 -; CHECK-PWR7-NEXT: std r29, 160(r1) -; CHECK-PWR7-NEXT: ld r27, 472(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r29, 168(r1) -; CHECK-PWR7-NEXT: std r0, 144(r1) -; CHECK-PWR7-NEXT: sub r3, r4, r3 -; CHECK-PWR7-NEXT: std r0, 152(r1) -; CHECK-PWR7-NEXT: ld r29, 488(r1) # 8-byte Folded Reload -; 
CHECK-PWR7-NEXT: ld r18, 400(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: std r3, 112(r1) +; CHECK-PWR7-NEXT: ld r28, 416(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR7-NEXT: clrlwi r10, r10, 24 +; CHECK-PWR7-NEXT: std r25, 256(r1) +; CHECK-PWR7-NEXT: std r25, 264(r1) +; CHECK-PWR7-NEXT: sub r9, r9, r11 +; CHECK-PWR7-NEXT: srawi r25, r26, 31 +; CHECK-PWR7-NEXT: xor r26, r26, r25 +; CHECK-PWR7-NEXT: ld r23, 376(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: srawi r11, r9, 31 +; CHECK-PWR7-NEXT: std r3, 120(r1) +; CHECK-PWR7-NEXT: sub r4, r26, r25 +; CHECK-PWR7-NEXT: clrlwi r22, r22, 24 +; CHECK-PWR7-NEXT: srawi r7, r8, 31 +; CHECK-PWR7-NEXT: sub r10, r10, r22 +; CHECK-PWR7-NEXT: ld r26, 400(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: xor r9, r9, r11 +; CHECK-PWR7-NEXT: sldi r3, r4, 56 +; CHECK-PWR7-NEXT: srawi r22, r10, 31 +; CHECK-PWR7-NEXT: xor r8, r8, r7 +; CHECK-PWR7-NEXT: xor r10, r10, r22 +; CHECK-PWR7-NEXT: sub r10, r10, r22 +; CHECK-PWR7-NEXT: ld r25, 392(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: sub r12, r9, r11 +; CHECK-PWR7-NEXT: lbz r9, 319(r1) +; CHECK-PWR7-NEXT: lbz r11, 335(r1) +; CHECK-PWR7-NEXT: std r3, 96(r1) +; CHECK-PWR7-NEXT: sldi r12, r12, 56 +; CHECK-PWR7-NEXT: std r3, 104(r1) +; CHECK-PWR7-NEXT: ld r22, 368(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: sldi r10, r10, 56 +; CHECK-PWR7-NEXT: std r10, 192(r1) +; CHECK-PWR7-NEXT: clrlwi r9, r9, 24 +; CHECK-PWR7-NEXT: clrlwi r11, r11, 24 +; CHECK-PWR7-NEXT: sub r9, r9, r11 +; CHECK-PWR7-NEXT: std r12, 272(r1) +; CHECK-PWR7-NEXT: std r12, 280(r1) +; CHECK-PWR7-NEXT: srawi r12, r19, 31 +; CHECK-PWR7-NEXT: xor r0, r19, r12 +; CHECK-PWR7-NEXT: ld r19, 344(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: sub r3, r0, r12 +; CHECK-PWR7-NEXT: srawi r11, r9, 31 +; CHECK-PWR7-NEXT: std r10, 200(r1) +; CHECK-PWR7-NEXT: xor r9, r9, r11 ; CHECK-PWR7-NEXT: sldi r3, r3, 56 -; CHECK-PWR7-NEXT: std r11, 128(r1) -; CHECK-PWR7-NEXT: ld r17, 392(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r11, 136(r1) -; CHECK-PWR7-NEXT: std r9, 112(r1) +; CHECK-PWR7-NEXT: sub r9, r9, r11 +; CHECK-PWR7-NEXT: std r3, 80(r1) +; CHECK-PWR7-NEXT: std r3, 88(r1) +; CHECK-PWR7-NEXT: sldi r9, r9, 56 +; CHECK-PWR7-NEXT: std r9, 288(r1) +; CHECK-PWR7-NEXT: std r9, 296(r1) +; CHECK-PWR7-NEXT: srawi r9, r20, 31 +; CHECK-PWR7-NEXT: xor r11, r20, r9 +; CHECK-PWR7-NEXT: ld r20, 352(r1) # 8-byte Folded Reload +; CHECK-PWR7-NEXT: sub r4, r11, r9 +; CHECK-PWR7-NEXT: sldi r3, r4, 56 ; CHECK-PWR7-NEXT: std r3, 64(r1) ; CHECK-PWR7-NEXT: std r3, 72(r1) -; CHECK-PWR7-NEXT: addi r3, r1, 304 -; CHECK-PWR7-NEXT: std r9, 120(r1) -; CHECK-PWR7-NEXT: ld r15, 376(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: std r7, 96(r1) -; CHECK-PWR7-NEXT: std r7, 104(r1) -; CHECK-PWR7-NEXT: std r5, 80(r1) -; CHECK-PWR7-NEXT: std r5, 88(r1) -; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3 +; CHECK-PWR7-NEXT: sub r3, r8, r7 +; CHECK-PWR7-NEXT: sldi r3, r3, 56 +; CHECK-PWR7-NEXT: std r3, 48(r1) +; CHECK-PWR7-NEXT: std r3, 56(r1) ; CHECK-PWR7-NEXT: addi r3, r1, 288 -; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 +; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 272 -; CHECK-PWR7-NEXT: ld r14, 368(r1) # 8-byte Folded Reload -; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 256 -; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 +; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 +; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 240 +; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 +; 
CHECK-PWR7-NEXT: addi r3, r1, 224 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: vmrghh v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 224 -; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 208 -; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 192 -; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3 +; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 +; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 176 +; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3 +; CHECK-PWR7-NEXT: addi r3, r1, 160 ; CHECK-PWR7-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR7-NEXT: vmrghh v3, v4, v3 ; CHECK-PWR7-NEXT: xxmrghw vs0, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v2, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 160 -; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 144 -; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 128 +; CHECK-PWR7-NEXT: vmrghb v2, v3, v2 +; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 +; CHECK-PWR7-NEXT: addi r3, r1, 112 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 +; CHECK-PWR7-NEXT: addi r3, r1, 96 ; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 -; CHECK-PWR7-NEXT: addi r3, r1, 112 ; CHECK-PWR7-NEXT: vmrghh v2, v3, v2 ; CHECK-PWR7-NEXT: lxvw4x v3, 0, r3 -; CHECK-PWR7-NEXT: addi r3, r1, 96 -; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 80 -; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 ; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 ; CHECK-PWR7-NEXT: addi r3, r1, 64 +; CHECK-PWR7-NEXT: vmrghb v3, v4, v3 +; CHECK-PWR7-NEXT: lxvw4x v4, 0, r3 +; CHECK-PWR7-NEXT: addi r3, r1, 48 ; CHECK-PWR7-NEXT: lxvw4x v5, 0, r3 ; CHECK-PWR7-NEXT: vmrghb v4, v5, v4 ; CHECK-PWR7-NEXT: vmrghh v3, v4, v3 ; CHECK-PWR7-NEXT: xxmrghw vs1, v3, v2 ; CHECK-PWR7-NEXT: xxmrghd v2, vs1, vs0 -; CHECK-PWR7-NEXT: addi r1, r1, 512 +; CHECK-PWR7-NEXT: addi r1, r1, 448 ; CHECK-PWR7-NEXT: blr entry: %vecext = extractelement <16 x i8> %a, i32 0 diff --git a/llvm/test/CodeGen/RISCV/fpclamptosat.ll b/llvm/test/CodeGen/RISCV/fpclamptosat.ll index 117e3e4aac45d..246e6a614d6aa 100644 --- a/llvm/test/CodeGen/RISCV/fpclamptosat.ll +++ b/llvm/test/CodeGen/RISCV/fpclamptosat.ll @@ -3292,30 +3292,30 @@ define i64 @ustest_f64i64_mm(double %x) { ; RV32IF-NEXT: mv a1, a0 ; RV32IF-NEXT: addi a0, sp, 8 ; RV32IF-NEXT: call __fixdfti -; RV32IF-NEXT: lw a0, 20(sp) -; RV32IF-NEXT: lw a1, 8(sp) -; RV32IF-NEXT: lw a2, 12(sp) +; RV32IF-NEXT: lw a0, 8(sp) +; RV32IF-NEXT: lw a1, 12(sp) +; RV32IF-NEXT: lw a2, 20(sp) ; RV32IF-NEXT: lw a3, 16(sp) -; RV32IF-NEXT: beqz a0, .LBB47_2 +; RV32IF-NEXT: beqz a2, .LBB47_2 ; RV32IF-NEXT: # %bb.1: # %entry -; RV32IF-NEXT: slti a4, a0, 0 +; RV32IF-NEXT: slti a4, a2, 0 ; RV32IF-NEXT: j .LBB47_3 ; RV32IF-NEXT: .LBB47_2: ; RV32IF-NEXT: seqz a4, a3 ; RV32IF-NEXT: .LBB47_3: # %entry ; RV32IF-NEXT: xori a3, a3, 1 -; RV32IF-NEXT: or a3, a3, a0 +; RV32IF-NEXT: or a3, a3, a2 ; RV32IF-NEXT: seqz a3, a3 ; RV32IF-NEXT: addi a3, a3, -1 ; RV32IF-NEXT: and a3, a3, a4 ; RV32IF-NEXT: neg a3, a3 -; RV32IF-NEXT: and a2, a3, a2 ; RV32IF-NEXT: and a1, a3, a1 ; RV32IF-NEXT: and a0, a3, a0 -; RV32IF-NEXT: slti a0, a0, 0 -; RV32IF-NEXT: addi a3, a0, -1 -; RV32IF-NEXT: and a0, a3, a1 -; RV32IF-NEXT: and a1, a3, a2 +; RV32IF-NEXT: and a2, a3, a2 +; RV32IF-NEXT: slti a2, a2, 0 +; RV32IF-NEXT: addi a2, a2, -1 +; RV32IF-NEXT: and a0, a2, a0 +; RV32IF-NEXT: and a1, a2, a1 ; RV32IF-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IF-NEXT: .cfi_restore ra ; RV32IF-NEXT: addi sp, sp, 32 @@ -3354,30 +3354,30 @@ define i64 
@ustest_f64i64_mm(double %x) { ; RV32IFD-NEXT: .cfi_offset ra, -4 ; RV32IFD-NEXT: addi a0, sp, 8 ; RV32IFD-NEXT: call __fixdfti -; RV32IFD-NEXT: lw a0, 20(sp) -; RV32IFD-NEXT: lw a1, 8(sp) -; RV32IFD-NEXT: lw a2, 12(sp) +; RV32IFD-NEXT: lw a0, 8(sp) +; RV32IFD-NEXT: lw a1, 12(sp) +; RV32IFD-NEXT: lw a2, 20(sp) ; RV32IFD-NEXT: lw a3, 16(sp) -; RV32IFD-NEXT: beqz a0, .LBB47_2 +; RV32IFD-NEXT: beqz a2, .LBB47_2 ; RV32IFD-NEXT: # %bb.1: # %entry -; RV32IFD-NEXT: slti a4, a0, 0 +; RV32IFD-NEXT: slti a4, a2, 0 ; RV32IFD-NEXT: j .LBB47_3 ; RV32IFD-NEXT: .LBB47_2: ; RV32IFD-NEXT: seqz a4, a3 ; RV32IFD-NEXT: .LBB47_3: # %entry ; RV32IFD-NEXT: xori a3, a3, 1 -; RV32IFD-NEXT: or a3, a3, a0 +; RV32IFD-NEXT: or a3, a3, a2 ; RV32IFD-NEXT: seqz a3, a3 ; RV32IFD-NEXT: addi a3, a3, -1 ; RV32IFD-NEXT: and a3, a3, a4 ; RV32IFD-NEXT: neg a3, a3 -; RV32IFD-NEXT: and a2, a3, a2 ; RV32IFD-NEXT: and a1, a3, a1 ; RV32IFD-NEXT: and a0, a3, a0 -; RV32IFD-NEXT: slti a0, a0, 0 -; RV32IFD-NEXT: addi a3, a0, -1 -; RV32IFD-NEXT: and a0, a3, a1 -; RV32IFD-NEXT: and a1, a3, a2 +; RV32IFD-NEXT: and a2, a3, a2 +; RV32IFD-NEXT: slti a2, a2, 0 +; RV32IFD-NEXT: addi a2, a2, -1 +; RV32IFD-NEXT: and a0, a2, a0 +; RV32IFD-NEXT: and a1, a2, a1 ; RV32IFD-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32IFD-NEXT: .cfi_restore ra ; RV32IFD-NEXT: addi sp, sp, 32 @@ -3530,30 +3530,30 @@ define i64 @ustest_f32i64_mm(float %x) { ; RV32-NEXT: .cfi_offset ra, -4 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) -; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB50_2 +; RV32-NEXT: beqz a2, .LBB50_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB50_3 ; RV32-NEXT: .LBB50_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB50_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a2 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: slti a2, a2, 0 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 @@ -3767,30 +3767,30 @@ define i64 @ustest_f16i64_mm(half %x) { ; RV32-NEXT: call __extendhfsf2 ; RV32-NEXT: addi a0, sp, 8 ; RV32-NEXT: call __fixsfti -; RV32-NEXT: lw a0, 20(sp) -; RV32-NEXT: lw a1, 8(sp) -; RV32-NEXT: lw a2, 12(sp) +; RV32-NEXT: lw a0, 8(sp) +; RV32-NEXT: lw a1, 12(sp) +; RV32-NEXT: lw a2, 20(sp) ; RV32-NEXT: lw a3, 16(sp) -; RV32-NEXT: beqz a0, .LBB53_2 +; RV32-NEXT: beqz a2, .LBB53_2 ; RV32-NEXT: # %bb.1: # %entry -; RV32-NEXT: slti a4, a0, 0 +; RV32-NEXT: slti a4, a2, 0 ; RV32-NEXT: j .LBB53_3 ; RV32-NEXT: .LBB53_2: ; RV32-NEXT: seqz a4, a3 ; RV32-NEXT: .LBB53_3: # %entry ; RV32-NEXT: xori a3, a3, 1 -; RV32-NEXT: or a3, a3, a0 +; RV32-NEXT: or a3, a3, a2 ; RV32-NEXT: seqz a3, a3 ; RV32-NEXT: addi a3, a3, -1 ; RV32-NEXT: and a3, a3, a4 ; RV32-NEXT: neg a3, a3 -; RV32-NEXT: and a2, a3, a2 ; RV32-NEXT: and a1, a3, a1 ; RV32-NEXT: and a0, a3, a0 -; RV32-NEXT: slti a0, a0, 0 -; RV32-NEXT: addi a3, a0, -1 -; RV32-NEXT: and a0, a3, a1 -; 
RV32-NEXT: and a1, a3, a2 +; RV32-NEXT: and a2, a3, a2 +; RV32-NEXT: slti a2, a2, 0 +; RV32-NEXT: addi a2, a2, -1 +; RV32-NEXT: and a0, a2, a0 +; RV32-NEXT: and a1, a2, a1 ; RV32-NEXT: lw ra, 28(sp) # 4-byte Folded Reload ; RV32-NEXT: .cfi_restore ra ; RV32-NEXT: addi sp, sp, 32 diff --git a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll index 97d102561129d..b1a6d163664e5 100644 --- a/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll +++ b/llvm/test/CodeGen/RISCV/intrinsic-cttz-elts-vscale.ll @@ -7,18 +7,18 @@ define i32 @ctz_nxv4i32( %a) #0 { ; RV32-LABEL: ctz_nxv4i32: ; RV32: # %bb.0: -; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV32-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; RV32-NEXT: vid.v v10 -; RV32-NEXT: li a1, -1 +; RV32-NEXT: vmv.v.i v11, -1 +; RV32-NEXT: csrr a0, vlenb ; RV32-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV32-NEXT: vmsne.vi v0, v8, 0 ; RV32-NEXT: srli a0, a0, 1 ; RV32-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV32-NEXT: vmv.v.x v8, a0 -; RV32-NEXT: vmadd.vx v10, a1, v8 -; RV32-NEXT: vmv.v.i v8, 0 -; RV32-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV32-NEXT: vmacc.vv v8, v10, v11 +; RV32-NEXT: vmv.v.i v9, 0 +; RV32-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV32-NEXT: vredmaxu.vs v8, v8, v8 ; RV32-NEXT: vmv.x.s a1, v8 ; RV32-NEXT: sub a0, a0, a1 @@ -28,18 +28,18 @@ define i32 @ctz_nxv4i32( %a) #0 { ; ; RV64-LABEL: ctz_nxv4i32: ; RV64: # %bb.0: -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; RV64-NEXT: vsetvli a0, zero, e16, m1, ta, ma ; RV64-NEXT: vid.v v10 -; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmv.v.i v11, -1 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: srli a0, a0, 1 ; RV64-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v10, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v10, v0 +; RV64-NEXT: vmacc.vv v8, v10, v11 +; RV64-NEXT: vmv.v.i v9, 0 +; RV64-NEXT: vmerge.vvm v8, v9, v8, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: subw a0, a0, a1 @@ -109,17 +109,17 @@ define i64 @ctz_nxv8i1_no_range( %a) { ; ; RV64-LABEL: ctz_nxv8i1_no_range: ; RV64: # %bb.0: -; RV64-NEXT: csrr a0, vlenb -; RV64-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; RV64-NEXT: vsetvli a0, zero, e64, m8, ta, ma ; RV64-NEXT: vid.v v16 -; RV64-NEXT: li a1, -1 +; RV64-NEXT: vmv.v.i v24, -1 +; RV64-NEXT: csrr a0, vlenb ; RV64-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV64-NEXT: vmsne.vi v0, v8, 0 ; RV64-NEXT: vsetvli zero, zero, e64, m8, ta, ma ; RV64-NEXT: vmv.v.x v8, a0 -; RV64-NEXT: vmadd.vx v16, a1, v8 -; RV64-NEXT: vmv.v.i v8, 0 -; RV64-NEXT: vmerge.vvm v8, v8, v16, v0 +; RV64-NEXT: vmacc.vv v8, v16, v24 +; RV64-NEXT: vmv.v.i v16, 0 +; RV64-NEXT: vmerge.vvm v8, v16, v8, v0 ; RV64-NEXT: vredmaxu.vs v8, v8, v8 ; RV64-NEXT: vmv.x.s a1, v8 ; RV64-NEXT: sub a0, a0, a1 diff --git a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll index cd7f30d8f5898..32753ca382fc7 100644 --- a/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/RISCV/wide-scalar-shift-legalization.ll @@ -716,101 +716,92 @@ define void @lshr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; 
RV32I-NEXT: lbu a5, 8(a0) -; RV32I-NEXT: lbu a6, 9(a0) -; RV32I-NEXT: lbu t3, 10(a0) -; RV32I-NEXT: lbu t4, 11(a0) ; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t0, a7 +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 9(a0) +; RV32I-NEXT: lbu t0, 10(a0) +; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t2, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: lbu t2, 0(a1) -; RV32I-NEXT: lbu t4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, t3, t0 +; RV32I-NEXT: lbu t0, 12(a0) +; RV32I-NEXT: lbu t2, 13(a0) +; RV32I-NEXT: lbu t3, 14(a0) +; RV32I-NEXT: lbu t4, 15(a0) +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t4 -; RV32I-NEXT: mv t2, sp -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, t0, a7 -; RV32I-NEXT: or a5, t3, a5 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: or a1, a1, t1 -; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a4, 4(sp) -; RV32I-NEXT: sw a5, 8(sp) -; RV32I-NEXT: sw a0, 12(sp) -; RV32I-NEXT: srli a0, a1, 3 -; RV32I-NEXT: andi a3, a1, 31 -; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: xori a3, a3, 31 -; RV32I-NEXT: add a0, t2, a0 -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: lw a5, 8(a0) -; RV32I-NEXT: lw a6, 0(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a7, a4, a1 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: srl a6, a6, a1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: srl a5, a5, a1 -; RV32I-NEXT: slli t1, a0, 1 -; RV32I-NEXT: srl a0, a0, a1 -; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a4, a4, a3 -; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a1, t2, t0 +; RV32I-NEXT: mv t0, sp +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: srli t3, a0, 3 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: andi a5, a0, 31 +; RV32I-NEXT: andi t3, t3, 12 +; RV32I-NEXT: xori a5, a5, 31 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a1, t2, a1 +; RV32I-NEXT: add t0, t0, t3 +; RV32I-NEXT: sw a4, 0(sp) +; RV32I-NEXT: sw a3, 4(sp) +; RV32I-NEXT: sw a6, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: lw a1, 4(t0) +; RV32I-NEXT: lw a3, 8(t0) +; RV32I-NEXT: lw a4, 0(t0) +; RV32I-NEXT: lw a6, 12(t0) +; RV32I-NEXT: srl a7, a1, a0 +; RV32I-NEXT: slli t0, a3, 1 +; RV32I-NEXT: srl a4, a4, a0 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli t1, a6, 1 +; RV32I-NEXT: srl a0, a6, a0 +; RV32I-NEXT: sll a6, t0, a5 +; RV32I-NEXT: sll a1, a1, a5 +; RV32I-NEXT: sll a5, t1, a5 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 -; RV32I-NEXT: or a1, a7, a1 -; RV32I-NEXT: 
or a4, a6, a4 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a1, a4, a1 +; RV32I-NEXT: or a3, a3, a5 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) ; RV32I-NEXT: sb t1, 15(a2) ; RV32I-NEXT: srli a0, a3, 16 -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: srli a6, a3, 8 -; RV32I-NEXT: srli a7, a4, 16 -; RV32I-NEXT: srli t0, a4, 24 -; RV32I-NEXT: srli t1, a4, 8 -; RV32I-NEXT: srli t2, a1, 16 -; RV32I-NEXT: srli t3, a1, 24 +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: srli a5, a3, 8 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t0, a1, 24 +; RV32I-NEXT: srli t1, a1, 8 +; RV32I-NEXT: srli t2, a6, 16 +; RV32I-NEXT: srli t3, a6, 24 ; RV32I-NEXT: sb a3, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) +; RV32I-NEXT: sb a5, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a0, a1, 8 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a0, a6, 8 +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t1, 1(a2) ; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a6, 4(a2) ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) @@ -952,102 +943,93 @@ define void @shl_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, a6, a5 -; RV32I-NEXT: lbu a5, 8(a0) -; RV32I-NEXT: lbu a6, 9(a0) -; RV32I-NEXT: lbu t3, 10(a0) -; RV32I-NEXT: lbu t4, 11(a0) ; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a4, a4, a3 +; RV32I-NEXT: or a5, a6, a5 +; RV32I-NEXT: or a3, t0, a7 +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 9(a0) +; RV32I-NEXT: lbu t0, 10(a0) +; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli a6, a6, 8 -; RV32I-NEXT: or a7, t0, a7 -; RV32I-NEXT: or t0, t2, t1 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: lbu a6, 12(a0) -; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t2, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t3, t3, 16 -; RV32I-NEXT: slli t4, t4, 24 -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t3, t4, t3 -; RV32I-NEXT: or a6, t1, a6 -; RV32I-NEXT: or a0, a0, t2 -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: lbu t2, 0(a1) -; RV32I-NEXT: lbu t4, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t2 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, t3, t0 +; RV32I-NEXT: lbu t0, 12(a0) +; RV32I-NEXT: lbu t2, 13(a0) +; RV32I-NEXT: lbu t3, 14(a0) +; RV32I-NEXT: lbu t4, 15(a0) +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: sw zero, 0(sp) ; RV32I-NEXT: sw zero, 4(sp) ; RV32I-NEXT: sw zero, 8(sp) ; RV32I-NEXT: sw zero, 12(sp) -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t4 -; RV32I-NEXT: addi t2, sp, 16 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a1, t2, t0 +; RV32I-NEXT: addi t0, sp, 16 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: srli t3, a0, 3 +; RV32I-NEXT: or a4, a5, a4 +; RV32I-NEXT: andi a5, a0, 31 +; RV32I-NEXT: andi t3, t3, 12 +; RV32I-NEXT: or a3, t1, a3 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a1, t2, a1 +; RV32I-NEXT: sub a7, t0, t3 +; RV32I-NEXT: sw a4, 16(sp) +; RV32I-NEXT: sw a3, 
20(sp) +; RV32I-NEXT: sw a6, 24(sp) +; RV32I-NEXT: sw a1, 28(sp) +; RV32I-NEXT: lw a1, 0(a7) +; RV32I-NEXT: lw a3, 4(a7) +; RV32I-NEXT: lw a4, 8(a7) +; RV32I-NEXT: lw a6, 12(a7) +; RV32I-NEXT: xori a5, a5, 31 +; RV32I-NEXT: sll a7, a3, a0 +; RV32I-NEXT: srli t0, a1, 1 +; RV32I-NEXT: sll a6, a6, a0 +; RV32I-NEXT: srli t1, a4, 1 +; RV32I-NEXT: sll a4, a4, a0 +; RV32I-NEXT: srli a3, a3, 1 +; RV32I-NEXT: sll a0, a1, a0 +; RV32I-NEXT: srl a1, t0, a5 +; RV32I-NEXT: srl t0, t1, a5 +; RV32I-NEXT: srl a3, a3, a5 +; RV32I-NEXT: srli a5, a0, 16 +; RV32I-NEXT: srli t1, a0, 24 +; RV32I-NEXT: srli t2, a0, 8 +; RV32I-NEXT: or a1, a7, a1 +; RV32I-NEXT: or a6, a6, t0 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: or a4, t0, a7 -; RV32I-NEXT: or a5, t3, a5 -; RV32I-NEXT: or a0, a0, a6 -; RV32I-NEXT: or a1, a1, t1 -; RV32I-NEXT: sw a3, 16(sp) -; RV32I-NEXT: sw a4, 20(sp) -; RV32I-NEXT: sw a5, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) -; RV32I-NEXT: srli a0, a1, 3 -; RV32I-NEXT: andi a3, a1, 31 -; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: sub a0, t2, a0 -; RV32I-NEXT: lw a4, 0(a0) -; RV32I-NEXT: lw a5, 4(a0) -; RV32I-NEXT: lw a6, 8(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: xori a3, a3, 31 -; RV32I-NEXT: sll a7, a5, a1 -; RV32I-NEXT: srli t0, a4, 1 -; RV32I-NEXT: sll a0, a0, a1 -; RV32I-NEXT: srli t1, a6, 1 -; RV32I-NEXT: sll a6, a6, a1 -; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: sll a1, a4, a1 -; RV32I-NEXT: srl a4, t0, a3 -; RV32I-NEXT: srl t0, t1, a3 -; RV32I-NEXT: srl a3, a5, a3 -; RV32I-NEXT: srli a5, a1, 16 -; RV32I-NEXT: srli t1, a1, 24 -; RV32I-NEXT: srli t2, a1, 8 -; RV32I-NEXT: or a4, a7, a4 -; RV32I-NEXT: or a0, a0, t0 -; RV32I-NEXT: or a3, a6, a3 -; RV32I-NEXT: sb a1, 0(a2) +; RV32I-NEXT: sb a0, 0(a2) ; RV32I-NEXT: sb t2, 1(a2) ; RV32I-NEXT: sb a5, 2(a2) ; RV32I-NEXT: sb t1, 3(a2) -; RV32I-NEXT: srli a1, a3, 16 -; RV32I-NEXT: srli a5, a3, 24 -; RV32I-NEXT: srli a6, a3, 8 -; RV32I-NEXT: srli a7, a0, 16 -; RV32I-NEXT: srli t0, a0, 24 -; RV32I-NEXT: srli t1, a0, 8 -; RV32I-NEXT: srli t2, a4, 16 -; RV32I-NEXT: srli t3, a4, 24 +; RV32I-NEXT: srli a0, a3, 16 +; RV32I-NEXT: srli a4, a3, 24 +; RV32I-NEXT: srli a5, a3, 8 +; RV32I-NEXT: srli a7, a6, 16 +; RV32I-NEXT: srli t0, a6, 24 +; RV32I-NEXT: srli t1, a6, 8 +; RV32I-NEXT: srli t2, a1, 16 +; RV32I-NEXT: srli t3, a1, 24 ; RV32I-NEXT: sb a3, 8(a2) -; RV32I-NEXT: sb a6, 9(a2) -; RV32I-NEXT: sb a1, 10(a2) -; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a1, a4, 8 -; RV32I-NEXT: sb a0, 12(a2) +; RV32I-NEXT: sb a5, 9(a2) +; RV32I-NEXT: sb a0, 10(a2) +; RV32I-NEXT: sb a4, 11(a2) +; RV32I-NEXT: srli a0, a1, 8 +; RV32I-NEXT: sb a6, 12(a2) ; RV32I-NEXT: sb t1, 13(a2) ; RV32I-NEXT: sb a7, 14(a2) ; RV32I-NEXT: sb t0, 15(a2) -; RV32I-NEXT: sb a4, 4(a2) -; RV32I-NEXT: sb a1, 5(a2) +; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) ; RV32I-NEXT: addi sp, sp, 32 @@ -1186,82 +1168,73 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu t1, 6(a0) ; RV32I-NEXT: lbu t2, 7(a0) ; RV32I-NEXT: slli a4, a4, 8 -; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: lbu a4, 8(a0) -; RV32I-NEXT: lbu t3, 9(a0) -; RV32I-NEXT: lbu t4, 10(a0) -; RV32I-NEXT: lbu t5, 11(a0) ; RV32I-NEXT: slli a5, a5, 16 ; RV32I-NEXT: slli a6, a6, 24 ; RV32I-NEXT: slli t0, t0, 8 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: lbu a6, 8(a0) +; RV32I-NEXT: lbu a7, 9(a0) +; RV32I-NEXT: lbu t0, 10(a0) +; RV32I-NEXT: lbu t3, 11(a0) ; RV32I-NEXT: slli t1, t1, 16 ; 
RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: slli a7, a7, 8 +; RV32I-NEXT: slli t0, t0, 16 +; RV32I-NEXT: slli t3, t3, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: or a6, a7, a6 +; RV32I-NEXT: or a7, t3, t0 ; RV32I-NEXT: lbu t0, 12(a0) -; RV32I-NEXT: lbu t1, 13(a0) -; RV32I-NEXT: lbu t2, 14(a0) -; RV32I-NEXT: lbu a0, 15(a0) -; RV32I-NEXT: slli t3, t3, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or a4, t3, a4 -; RV32I-NEXT: or t3, t5, t4 -; RV32I-NEXT: or t0, t1, t0 -; RV32I-NEXT: lbu t1, 1(a1) -; RV32I-NEXT: lbu t4, 0(a1) -; RV32I-NEXT: lbu t5, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli t1, t1, 8 -; RV32I-NEXT: or t1, t1, t4 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or a1, a1, t5 -; RV32I-NEXT: or a3, a5, a3 -; RV32I-NEXT: mv a5, sp -; RV32I-NEXT: slli t2, t2, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: or t2, a0, t2 -; RV32I-NEXT: srai a0, a0, 31 +; RV32I-NEXT: lbu t2, 13(a0) +; RV32I-NEXT: lbu t3, 14(a0) +; RV32I-NEXT: lbu t4, 15(a0) +; RV32I-NEXT: lbu a0, 0(a1) +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: or a1, t2, t0 +; RV32I-NEXT: mv t0, sp +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: srli a4, a0, 3 +; RV32I-NEXT: or a5, t1, a5 +; RV32I-NEXT: andi t1, a0, 31 +; RV32I-NEXT: or t2, t4, t3 +; RV32I-NEXT: srai t3, t4, 31 +; RV32I-NEXT: andi a4, a4, 12 +; RV32I-NEXT: xori t1, t1, 31 ; RV32I-NEXT: or a6, a7, a6 -; RV32I-NEXT: or a4, t3, a4 -; RV32I-NEXT: or a7, t2, t0 -; RV32I-NEXT: or a1, a1, t1 -; RV32I-NEXT: sw a0, 16(sp) -; RV32I-NEXT: sw a0, 20(sp) -; RV32I-NEXT: sw a0, 24(sp) -; RV32I-NEXT: sw a0, 28(sp) +; RV32I-NEXT: or a1, t2, a1 +; RV32I-NEXT: sw t3, 16(sp) +; RV32I-NEXT: sw t3, 20(sp) +; RV32I-NEXT: sw t3, 24(sp) +; RV32I-NEXT: sw t3, 28(sp) +; RV32I-NEXT: add a4, t0, a4 ; RV32I-NEXT: sw a3, 0(sp) -; RV32I-NEXT: sw a6, 4(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a7, 12(sp) -; RV32I-NEXT: srli a0, a1, 3 -; RV32I-NEXT: andi a3, a1, 31 -; RV32I-NEXT: andi a0, a0, 12 -; RV32I-NEXT: xori a3, a3, 31 -; RV32I-NEXT: add a0, a5, a0 -; RV32I-NEXT: lw a4, 4(a0) -; RV32I-NEXT: lw a5, 8(a0) -; RV32I-NEXT: lw a6, 0(a0) -; RV32I-NEXT: lw a0, 12(a0) -; RV32I-NEXT: srl a7, a4, a1 -; RV32I-NEXT: slli t0, a5, 1 -; RV32I-NEXT: srl a6, a6, a1 -; RV32I-NEXT: slli a4, a4, 1 -; RV32I-NEXT: srl a5, a5, a1 -; RV32I-NEXT: slli t1, a0, 1 -; RV32I-NEXT: sra a0, a0, a1 -; RV32I-NEXT: sll a1, t0, a3 -; RV32I-NEXT: sll a4, a4, a3 -; RV32I-NEXT: sll a3, t1, a3 +; RV32I-NEXT: sw a5, 4(sp) +; RV32I-NEXT: sw a6, 8(sp) +; RV32I-NEXT: sw a1, 12(sp) +; RV32I-NEXT: lw a1, 4(a4) +; RV32I-NEXT: lw a3, 8(a4) +; RV32I-NEXT: lw a5, 0(a4) +; RV32I-NEXT: lw a4, 12(a4) +; RV32I-NEXT: srl a6, a1, a0 +; RV32I-NEXT: slli a7, a3, 1 +; RV32I-NEXT: srl a5, a5, a0 +; RV32I-NEXT: slli a1, a1, 1 +; RV32I-NEXT: srl a3, a3, a0 +; RV32I-NEXT: slli t0, a4, 1 +; RV32I-NEXT: sra a0, a4, a0 +; RV32I-NEXT: sll a4, a7, t1 +; RV32I-NEXT: sll a1, a1, t1 +; RV32I-NEXT: sll a7, t0, t1 ; RV32I-NEXT: srli t0, a0, 16 ; RV32I-NEXT: srli t1, a0, 24 ; RV32I-NEXT: srli t2, a0, 8 -; RV32I-NEXT: or a1, a7, a1 ; RV32I-NEXT: or a4, a6, a4 -; RV32I-NEXT: or a3, a5, a3 +; RV32I-NEXT: or a1, a5, a1 +; RV32I-NEXT: or a3, a3, a7 ; RV32I-NEXT: sb a0, 12(a2) ; RV32I-NEXT: sb t2, 13(a2) ; RV32I-NEXT: sb t0, 14(a2) @@ -1269,21 +1242,21 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr 
%dst) nounwind { ; RV32I-NEXT: srli a0, a3, 16 ; RV32I-NEXT: srli a5, a3, 24 ; RV32I-NEXT: srli a6, a3, 8 -; RV32I-NEXT: srli a7, a4, 16 -; RV32I-NEXT: srli t0, a4, 24 -; RV32I-NEXT: srli t1, a4, 8 -; RV32I-NEXT: srli t2, a1, 16 -; RV32I-NEXT: srli t3, a1, 24 +; RV32I-NEXT: srli a7, a1, 16 +; RV32I-NEXT: srli t0, a1, 24 +; RV32I-NEXT: srli t1, a1, 8 +; RV32I-NEXT: srli t2, a4, 16 +; RV32I-NEXT: srli t3, a4, 24 ; RV32I-NEXT: sb a3, 8(a2) ; RV32I-NEXT: sb a6, 9(a2) ; RV32I-NEXT: sb a0, 10(a2) ; RV32I-NEXT: sb a5, 11(a2) -; RV32I-NEXT: srli a0, a1, 8 -; RV32I-NEXT: sb a4, 0(a2) +; RV32I-NEXT: srli a0, a4, 8 +; RV32I-NEXT: sb a1, 0(a2) ; RV32I-NEXT: sb t1, 1(a2) ; RV32I-NEXT: sb a7, 2(a2) ; RV32I-NEXT: sb t0, 3(a2) -; RV32I-NEXT: sb a1, 4(a2) +; RV32I-NEXT: sb a4, 4(a2) ; RV32I-NEXT: sb a0, 5(a2) ; RV32I-NEXT: sb t2, 6(a2) ; RV32I-NEXT: sb t3, 7(a2) @@ -1299,19 +1272,17 @@ define void @ashr_16bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: lshr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -160 -; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -144 +; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1328,143 +1299,122 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli s8, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a5, a4, a3 -; RV64I-NEXT: or a6, a6, s8 -; RV64I-NEXT: or a3, t0, a7 -; RV64I-NEXT: or a4, t2, t1 -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t5, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s8, 22(a0) +; 
RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t6, 24(a0) -; RV64I-NEXT: lbu s0, 25(a0) -; RV64I-NEXT: lbu s1, 26(a0) -; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: or t5, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s0, 24(a0) +; RV64I-NEXT: lbu s1, 25(a0) +; RV64I-NEXT: lbu s2, 26(a0) +; RV64I-NEXT: lbu s3, 27(a0) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s8, s8, 16 +; RV64I-NEXT: slli s9, s9, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or t6, s9, s8 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu s1, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: slli s0, s0, 8 -; RV64I-NEXT: slli s1, s1, 16 -; RV64I-NEXT: slli s2, s2, 24 -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or a0, s11, s10 -; RV64I-NEXT: or t6, s0, t6 -; RV64I-NEXT: or s0, s2, s1 -; RV64I-NEXT: or s1, s4, s3 -; RV64I-NEXT: lbu s2, 0(a1) -; RV64I-NEXT: lbu s3, 1(a1) -; RV64I-NEXT: lbu s4, 2(a1) -; RV64I-NEXT: lbu s7, 3(a1) -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, s7, s4 -; RV64I-NEXT: lbu s4, 5(a1) -; RV64I-NEXT: lbu s6, 4(a1) -; RV64I-NEXT: lbu s7, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or s4, s4, s6 -; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: lbu a0, 0(a1) ; RV64I-NEXT: sd zero, 32(sp) ; RV64I-NEXT: sd zero, 40(sp) ; RV64I-NEXT: sd zero, 48(sp) ; RV64I-NEXT: sd zero, 56(sp) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: mv a6, sp +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a1, s3, s2 +; RV64I-NEXT: mv s2, sp +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: srli s3, a0, 3 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: andi s5, a0, 63 +; RV64I-NEXT: andi s3, s3, 24 +; RV64I-NEXT: xori s5, s5, 63 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: or a7, t2, t1 -; RV64I-NEXT: or t0, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: or t1, s0, t6 -; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s3, s2 -; RV64I-NEXT: or a1, a1, s4 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: slli t2, t2, 32 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a3, a3, a5 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: or a5, t2, t1 -; RV64I-NEXT: or a1, a1, t3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or t1, s4, s1 +; RV64I-NEXT: add s2, s2, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, 
a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a1, t1, a1 ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a0, 16(sp) -; RV64I-NEXT: sd a5, 24(sp) -; RV64I-NEXT: srli a0, a1, 3 -; RV64I-NEXT: andi a3, a1, 63 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: xori a3, a3, 63 -; RV64I-NEXT: add a0, a6, a0 -; RV64I-NEXT: ld a4, 8(a0) -; RV64I-NEXT: ld a5, 16(a0) -; RV64I-NEXT: ld a6, 0(a0) -; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: srl a7, a4, a1 +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a1, 24(sp) +; RV64I-NEXT: ld a1, 8(s2) +; RV64I-NEXT: ld a3, 16(s2) +; RV64I-NEXT: ld a4, 0(s2) +; RV64I-NEXT: ld a5, 24(s2) +; RV64I-NEXT: srl a6, a1, a0 +; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: srl a4, a4, a0 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: srl a3, a3, a0 ; RV64I-NEXT: slli t0, a5, 1 -; RV64I-NEXT: srl a6, a6, a1 -; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: slli t1, a0, 1 -; RV64I-NEXT: srl t2, a0, a1 -; RV64I-NEXT: sll a0, t0, a3 -; RV64I-NEXT: sll a1, a4, a3 -; RV64I-NEXT: sll a3, t1, a3 -; RV64I-NEXT: srli a4, t2, 56 -; RV64I-NEXT: srli t0, t2, 48 -; RV64I-NEXT: srli t1, t2, 40 -; RV64I-NEXT: srli t3, t2, 32 -; RV64I-NEXT: srli t4, t2, 24 -; RV64I-NEXT: srli t5, t2, 16 -; RV64I-NEXT: srli t6, t2, 8 -; RV64I-NEXT: or a0, a7, a0 -; RV64I-NEXT: or a1, a6, a1 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: srl a5, a5, a0 +; RV64I-NEXT: sll a0, a7, s5 +; RV64I-NEXT: sll a1, a1, s5 +; RV64I-NEXT: sll a7, t0, s5 +; RV64I-NEXT: srli t0, a5, 56 +; RV64I-NEXT: srli t1, a5, 48 +; RV64I-NEXT: srli t2, a5, 40 +; RV64I-NEXT: srli t3, a5, 32 +; RV64I-NEXT: srli t4, a5, 24 +; RV64I-NEXT: srli t5, a5, 16 +; RV64I-NEXT: srli t6, a5, 8 +; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: or a3, a3, a7 ; RV64I-NEXT: sb t3, 28(a2) -; RV64I-NEXT: sb t1, 29(a2) -; RV64I-NEXT: sb t0, 30(a2) -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: sb t2, 24(a2) +; RV64I-NEXT: sb t2, 29(a2) +; RV64I-NEXT: sb t1, 30(a2) +; RV64I-NEXT: sb t0, 31(a2) +; RV64I-NEXT: sb a5, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) @@ -1513,19 +1463,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 9(a2) ; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 160 +; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi 
sp, sp, 144 ; RV64I-NEXT: ret ; ; RV32I-LABEL: lshr_32bytes: @@ -1550,67 +1498,55 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu t3, 6(a0) +; RV32I-NEXT: lbu t6, 7(a0) +; RV32I-NEXT: lbu s2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s5, 11(a0) +; RV32I-NEXT: lbu s7, 12(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu s9, 14(a0) +; RV32I-NEXT: lbu s10, 15(a0) +; RV32I-NEXT: lbu s11, 16(a0) +; RV32I-NEXT: lbu ra, 17(a0) +; RV32I-NEXT: lbu t4, 18(a0) +; RV32I-NEXT: lbu s0, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 21(a0) +; RV32I-NEXT: lbu t5, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t6, t3 +; RV32I-NEXT: or a7, s3, s2 +; RV32I-NEXT: or t0, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s5, 25(a0) +; RV32I-NEXT: lbu s6, 26(a0) +; RV32I-NEXT: lbu t6, 27(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s9, s9, 16 +; RV32I-NEXT: slli s10, s10, 24 +; RV32I-NEXT: slli ra, ra, 8 +; RV32I-NEXT: or s7, s8, s7 +; RV32I-NEXT: or s2, s10, s9 +; RV32I-NEXT: or s3, ra, s11 +; RV32I-NEXT: lbu s4, 28(a0) +; RV32I-NEXT: lbu s8, 29(a0) +; RV32I-NEXT: lbu s9, 30(a0) +; RV32I-NEXT: lbu s10, 31(a0) +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: sw zero, 56(sp) ; RV32I-NEXT: sw zero, 60(sp) ; RV32I-NEXT: sw zero, 64(sp) @@ -1619,89 +1555,90 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw zero, 44(sp) ; RV32I-NEXT: sw zero, 48(sp) ; RV32I-NEXT: sw 
zero, 52(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 8 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw t2, 32(sp) -; RV32I-NEXT: sw t3, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or t4, s0, t4 +; RV32I-NEXT: addi s0, sp, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s9, s9, 16 +; RV32I-NEXT: slli s10, s10, 24 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: srli a1, a0, 3 -; RV32I-NEXT: andi a3, a0, 31 -; RV32I-NEXT: andi a4, a1, 28 -; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s3, a4 -; RV32I-NEXT: lw a3, 0(a4) -; RV32I-NEXT: lw a5, 4(a4) -; RV32I-NEXT: lw a6, 8(a4) -; RV32I-NEXT: lw a7, 12(a4) -; RV32I-NEXT: lw t0, 16(a4) -; RV32I-NEXT: lw t1, 20(a4) -; RV32I-NEXT: lw t2, 24(a4) -; RV32I-NEXT: lw a4, 28(a4) -; RV32I-NEXT: srl t3, a5, a0 -; RV32I-NEXT: slli t4, a6, 1 +; RV32I-NEXT: or t2, s1, t5 +; RV32I-NEXT: andi t5, a0, 31 +; RV32I-NEXT: or t3, s5, t3 +; RV32I-NEXT: or t6, t6, s6 +; RV32I-NEXT: or s1, s8, s4 +; RV32I-NEXT: or s4, s10, s9 +; RV32I-NEXT: andi s5, a1, 28 +; RV32I-NEXT: xori a1, t5, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, s2, s7 +; RV32I-NEXT: or a7, t4, s3 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t1, t6, t3 +; RV32I-NEXT: or t2, s4, s1 +; RV32I-NEXT: add s0, s0, s5 +; RV32I-NEXT: sw a7, 24(sp) +; RV32I-NEXT: sw t0, 28(sp) +; RV32I-NEXT: sw t1, 32(sp) +; RV32I-NEXT: sw t2, 36(sp) +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a5, 16(sp) +; RV32I-NEXT: sw a6, 20(sp) +; RV32I-NEXT: lw a3, 0(s0) +; RV32I-NEXT: lw a4, 4(s0) +; RV32I-NEXT: lw a5, 8(s0) +; RV32I-NEXT: lw a6, 12(s0) +; RV32I-NEXT: lw a7, 16(s0) +; RV32I-NEXT: lw t0, 20(s0) +; RV32I-NEXT: lw t1, 24(s0) +; RV32I-NEXT: lw t2, 28(s0) +; RV32I-NEXT: srl t3, a4, a0 +; RV32I-NEXT: slli t4, a5, 1 ; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: srl t5, a7, a0 -; RV32I-NEXT: slli t6, t0, 1 -; RV32I-NEXT: srl a6, a6, a0 -; RV32I-NEXT: slli a7, a7, 1 -; RV32I-NEXT: srl s0, t1, a0 -; RV32I-NEXT: slli s1, t2, 1 -; RV32I-NEXT: srl t0, t0, a0 -; RV32I-NEXT: slli t1, t1, 1 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl t5, a6, a0 +; RV32I-NEXT: slli t6, a7, 1 +; RV32I-NEXT: srl a5, a5, a0 +; RV32I-NEXT: slli a6, a6, 1 +; RV32I-NEXT: srl s0, t0, a0 +; RV32I-NEXT: slli s1, t1, 1 +; RV32I-NEXT: srl a7, a7, a0 +; RV32I-NEXT: slli t0, t0, 1 +; RV32I-NEXT: srl t1, t1, a0 +; RV32I-NEXT: slli s2, t2, 1 ; RV32I-NEXT: srl t2, t2, a0 -; 
RV32I-NEXT: slli s2, a4, 1 -; RV32I-NEXT: srl s3, a4, a0 ; RV32I-NEXT: sll a0, t4, a1 -; RV32I-NEXT: sll a4, a5, a1 -; RV32I-NEXT: sll a5, t6, a1 -; RV32I-NEXT: sll a7, a7, a1 -; RV32I-NEXT: sll t4, s1, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t6, s2, a1 -; RV32I-NEXT: srli s1, s3, 24 -; RV32I-NEXT: srli s2, s3, 16 -; RV32I-NEXT: srli s4, s3, 8 +; RV32I-NEXT: sll a4, a4, a1 +; RV32I-NEXT: sll t4, t6, a1 +; RV32I-NEXT: sll a6, a6, a1 +; RV32I-NEXT: sll t6, s1, a1 +; RV32I-NEXT: sll t0, t0, a1 +; RV32I-NEXT: sll s1, s2, a1 +; RV32I-NEXT: srli s2, t2, 24 +; RV32I-NEXT: srli s3, t2, 16 +; RV32I-NEXT: srli s4, t2, 8 ; RV32I-NEXT: or a0, t3, a0 ; RV32I-NEXT: or a1, a3, a4 -; RV32I-NEXT: or a3, t5, a5 -; RV32I-NEXT: or a4, a6, a7 -; RV32I-NEXT: or a5, s0, t4 -; RV32I-NEXT: or a6, t0, t1 -; RV32I-NEXT: or a7, t2, t6 -; RV32I-NEXT: sb s3, 28(a2) +; RV32I-NEXT: or a3, t5, t4 +; RV32I-NEXT: or a4, a5, a6 +; RV32I-NEXT: or a5, s0, t6 +; RV32I-NEXT: or a6, a7, t0 +; RV32I-NEXT: or a7, t1, s1 +; RV32I-NEXT: sb t2, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) -; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb s1, 31(a2) +; RV32I-NEXT: sb s3, 30(a2) +; RV32I-NEXT: sb s2, 31(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 ; RV32I-NEXT: srli t2, a7, 8 @@ -1775,19 +1712,17 @@ define void @lshr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: shl_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -160 -; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -144 +; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -1804,146 +1739,125 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli s8, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a5, a4, a3 -; RV64I-NEXT: or a6, a6, s8 -; RV64I-NEXT: or a3, t0, a7 -; RV64I-NEXT: or a4, t2, t1 -; RV64I-NEXT: lbu s8, 
20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t5, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s8, 22(a0) +; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t6, 24(a0) -; RV64I-NEXT: lbu s0, 25(a0) -; RV64I-NEXT: lbu s1, 26(a0) -; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: or t5, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s0, 24(a0) +; RV64I-NEXT: lbu s1, 25(a0) +; RV64I-NEXT: lbu s2, 26(a0) +; RV64I-NEXT: lbu s3, 27(a0) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s8, s8, 16 +; RV64I-NEXT: slli s9, s9, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or t6, s9, s8 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu s1, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: slli s0, s0, 8 -; RV64I-NEXT: slli s1, s1, 16 -; RV64I-NEXT: slli s2, s2, 24 -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or a0, s11, s10 -; RV64I-NEXT: or t6, s0, t6 -; RV64I-NEXT: or s0, s2, s1 -; RV64I-NEXT: or s1, s4, s3 -; RV64I-NEXT: lbu s2, 0(a1) -; RV64I-NEXT: lbu s3, 1(a1) -; RV64I-NEXT: lbu s4, 2(a1) -; RV64I-NEXT: lbu s7, 3(a1) -; RV64I-NEXT: slli s5, s5, 16 -; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, s7, s4 -; RV64I-NEXT: lbu s4, 5(a1) -; RV64I-NEXT: lbu s6, 4(a1) -; RV64I-NEXT: lbu s7, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or s4, s4, s6 -; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, s7 +; RV64I-NEXT: lbu a0, 0(a1) ; RV64I-NEXT: sd zero, 0(sp) ; RV64I-NEXT: sd zero, 8(sp) ; RV64I-NEXT: sd zero, 16(sp) ; RV64I-NEXT: sd zero, 24(sp) -; RV64I-NEXT: or a5, a6, a5 -; RV64I-NEXT: addi a6, sp, 32 +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a1, s3, s2 +; RV64I-NEXT: addi s2, sp, 32 +; RV64I-NEXT: slli s4, s4, 8 +; RV64I-NEXT: slli s5, s5, 16 +; RV64I-NEXT: slli s6, s6, 24 +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: srli s3, a0, 3 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: andi s5, a0, 63 +; RV64I-NEXT: andi s3, s3, 24 ; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, t0, a7 -; RV64I-NEXT: or a7, t2, t1 -; RV64I-NEXT: or t0, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: or t1, s0, t6 -; RV64I-NEXT: or t2, s5, s1 -; RV64I-NEXT: or t3, s3, s2 -; RV64I-NEXT: or a1, a1, s4 -; RV64I-NEXT: slli a3, a3, 32 -; RV64I-NEXT: slli a7, a7, 32 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: slli t2, t2, 32 -; RV64I-NEXT: slli a1, a1, 32 -; RV64I-NEXT: or a3, a3, a5 -; RV64I-NEXT: or a4, a7, a4 -; RV64I-NEXT: or a0, a0, t0 -; RV64I-NEXT: or a5, t2, t1 -; RV64I-NEXT: or a1, a1, t3 +; 
RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or t1, s4, s1 +; RV64I-NEXT: sub t2, s2, s3 +; RV64I-NEXT: slli a4, a4, 32 +; RV64I-NEXT: slli a6, a6, 32 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t1, t1, 32 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a1, t1, a1 ; RV64I-NEXT: sd a3, 32(sp) ; RV64I-NEXT: sd a4, 40(sp) -; RV64I-NEXT: sd a0, 48(sp) -; RV64I-NEXT: sd a5, 56(sp) -; RV64I-NEXT: srli a0, a1, 3 -; RV64I-NEXT: andi a3, a1, 63 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: sub a0, a6, a0 -; RV64I-NEXT: ld a4, 0(a0) -; RV64I-NEXT: ld a5, 8(a0) -; RV64I-NEXT: ld a6, 16(a0) -; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: xori a3, a3, 63 -; RV64I-NEXT: sll a7, a5, a1 -; RV64I-NEXT: srli t0, a4, 1 -; RV64I-NEXT: sll t1, a0, a1 -; RV64I-NEXT: srli a0, a6, 1 -; RV64I-NEXT: sll a6, a6, a1 -; RV64I-NEXT: srli a5, a5, 1 -; RV64I-NEXT: sll a4, a4, a1 -; RV64I-NEXT: srl a1, t0, a3 -; RV64I-NEXT: srl t0, a0, a3 -; RV64I-NEXT: srl a3, a5, a3 -; RV64I-NEXT: srli a5, a4, 56 -; RV64I-NEXT: srli t2, a4, 48 -; RV64I-NEXT: srli t3, a4, 40 -; RV64I-NEXT: srli t4, a4, 32 -; RV64I-NEXT: srli t5, a4, 24 -; RV64I-NEXT: srli t6, a4, 16 -; RV64I-NEXT: srli s0, a4, 8 -; RV64I-NEXT: or a0, a7, a1 -; RV64I-NEXT: or a1, t1, t0 -; RV64I-NEXT: or a3, a6, a3 -; RV64I-NEXT: sb t4, 4(a2) -; RV64I-NEXT: sb t3, 5(a2) -; RV64I-NEXT: sb t2, 6(a2) -; RV64I-NEXT: sb a5, 7(a2) -; RV64I-NEXT: sb a4, 0(a2) -; RV64I-NEXT: sb s0, 1(a2) -; RV64I-NEXT: sb t6, 2(a2) -; RV64I-NEXT: sb t5, 3(a2) +; RV64I-NEXT: sd a5, 48(sp) +; RV64I-NEXT: sd a1, 56(sp) +; RV64I-NEXT: ld a1, 0(t2) +; RV64I-NEXT: ld a3, 8(t2) +; RV64I-NEXT: ld a4, 16(t2) +; RV64I-NEXT: ld a5, 24(t2) +; RV64I-NEXT: xori a6, s5, 63 +; RV64I-NEXT: sll a7, a3, a0 +; RV64I-NEXT: srli t0, a1, 1 +; RV64I-NEXT: sll a5, a5, a0 +; RV64I-NEXT: srli t1, a4, 1 +; RV64I-NEXT: sll a4, a4, a0 +; RV64I-NEXT: srli a3, a3, 1 +; RV64I-NEXT: sll t2, a1, a0 +; RV64I-NEXT: srl a0, t0, a6 +; RV64I-NEXT: srl a1, t1, a6 +; RV64I-NEXT: srl a3, a3, a6 +; RV64I-NEXT: srli a6, t2, 56 +; RV64I-NEXT: srli t0, t2, 48 +; RV64I-NEXT: srli t1, t2, 40 +; RV64I-NEXT: srli t3, t2, 32 +; RV64I-NEXT: srli t4, t2, 24 +; RV64I-NEXT: srli t5, t2, 16 +; RV64I-NEXT: srli t6, t2, 8 +; RV64I-NEXT: or a0, a7, a0 +; RV64I-NEXT: or a1, a5, a1 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: sb t3, 4(a2) +; RV64I-NEXT: sb t1, 5(a2) +; RV64I-NEXT: sb t0, 6(a2) +; RV64I-NEXT: sb a6, 7(a2) +; RV64I-NEXT: sb t2, 0(a2) +; RV64I-NEXT: sb t6, 1(a2) +; RV64I-NEXT: sb t5, 2(a2) +; RV64I-NEXT: sb t4, 3(a2) ; RV64I-NEXT: srli a4, a3, 56 ; RV64I-NEXT: srli a5, a3, 48 ; RV64I-NEXT: srli a6, a3, 40 @@ -1989,19 +1903,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: sb a1, 9(a2) ; RV64I-NEXT: sb a5, 10(a2) ; RV64I-NEXT: sb a3, 11(a2) -; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded 
Reload -; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 160 +; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 144 ; RV64I-NEXT: ret ; ; RV32I-LABEL: shl_32bytes: @@ -2026,67 +1938,55 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s2, 13(a0) -; RV32I-NEXT: lbu s4, 14(a0) -; RV32I-NEXT: lbu s5, 15(a0) -; RV32I-NEXT: lbu s6, 16(a0) -; RV32I-NEXT: lbu s7, 17(a0) -; RV32I-NEXT: lbu s8, 18(a0) -; RV32I-NEXT: lbu s9, 19(a0) +; RV32I-NEXT: lbu t3, 6(a0) +; RV32I-NEXT: lbu t6, 7(a0) +; RV32I-NEXT: lbu s2, 8(a0) +; RV32I-NEXT: lbu s3, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s5, 11(a0) +; RV32I-NEXT: lbu s7, 12(a0) +; RV32I-NEXT: lbu s8, 13(a0) +; RV32I-NEXT: lbu s9, 14(a0) +; RV32I-NEXT: lbu s10, 15(a0) +; RV32I-NEXT: lbu s11, 16(a0) +; RV32I-NEXT: lbu ra, 17(a0) +; RV32I-NEXT: lbu t4, 18(a0) +; RV32I-NEXT: lbu s0, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s10, 20(a0) -; RV32I-NEXT: lbu s11, 21(a0) -; RV32I-NEXT: lbu ra, 22(a0) -; RV32I-NEXT: lbu a3, 23(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 21(a0) +; RV32I-NEXT: lbu t5, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli t3, t3, 16 ; RV32I-NEXT: slli t6, t6, 24 -; RV32I-NEXT: or a5, t0, a5 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu s1, 24(a0) -; RV32I-NEXT: lbu s3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s2, s2, 8 +; RV32I-NEXT: slli s3, s3, 8 ; RV32I-NEXT: slli s4, s4, 16 ; RV32I-NEXT: slli s5, s5, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, s5, s4 -; RV32I-NEXT: or t3, s7, s6 -; RV32I-NEXT: lbu t6, 28(a0) -; RV32I-NEXT: lbu s4, 29(a0) -; RV32I-NEXT: lbu s5, 30(a0) -; RV32I-NEXT: lbu s6, 31(a0) -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli s9, s9, 24 -; RV32I-NEXT: slli s11, s11, 8 -; RV32I-NEXT: slli ra, ra, 16 -; RV32I-NEXT: slli a3, a3, 24 -; RV32I-NEXT: or a0, s9, s8 -; RV32I-NEXT: or s0, s11, s10 -; RV32I-NEXT: or s2, a3, ra -; RV32I-NEXT: lbu a3, 0(a1) -; RV32I-NEXT: lbu s7, 1(a1) -; RV32I-NEXT: lbu s8, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) +; RV32I-NEXT: or a5, t0, a5 +; RV32I-NEXT: or a6, t6, t3 +; RV32I-NEXT: or a7, s3, s2 +; RV32I-NEXT: or t0, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s5, 25(a0) +; RV32I-NEXT: lbu s6, 26(a0) +; RV32I-NEXT: lbu t6, 
27(a0) +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s9, s9, 16 +; RV32I-NEXT: slli s10, s10, 24 +; RV32I-NEXT: slli ra, ra, 8 +; RV32I-NEXT: or s7, s8, s7 +; RV32I-NEXT: or s2, s10, s9 +; RV32I-NEXT: or s3, ra, s11 +; RV32I-NEXT: lbu s4, 28(a0) +; RV32I-NEXT: lbu s8, 29(a0) +; RV32I-NEXT: lbu s9, 30(a0) +; RV32I-NEXT: lbu s10, 31(a0) +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: sw zero, 24(sp) ; RV32I-NEXT: sw zero, 28(sp) ; RV32I-NEXT: sw zero, 32(sp) @@ -2095,88 +1995,89 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: sw zero, 12(sp) ; RV32I-NEXT: sw zero, 16(sp) ; RV32I-NEXT: sw zero, 20(sp) -; RV32I-NEXT: slli s3, s3, 8 -; RV32I-NEXT: or s1, s3, s1 -; RV32I-NEXT: addi s3, sp, 40 ; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s4, s4, 8 -; RV32I-NEXT: slli s5, s5, 16 -; RV32I-NEXT: slli s6, s6, 24 -; RV32I-NEXT: slli s7, s7, 8 -; RV32I-NEXT: slli s8, s8, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s4, t6 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a3, s7, a3 -; RV32I-NEXT: or a1, a1, s8 -; RV32I-NEXT: lw s4, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, s4 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, a0, t3 -; RV32I-NEXT: or t1, s2, s0 -; RV32I-NEXT: or t2, t4, s1 -; RV32I-NEXT: or t3, t6, t5 -; RV32I-NEXT: or a0, a1, a3 -; RV32I-NEXT: sw t0, 56(sp) -; RV32I-NEXT: sw t1, 60(sp) -; RV32I-NEXT: sw t2, 64(sp) -; RV32I-NEXT: sw t3, 68(sp) -; RV32I-NEXT: sw a4, 40(sp) -; RV32I-NEXT: sw a5, 44(sp) -; RV32I-NEXT: sw a6, 48(sp) -; RV32I-NEXT: sw a7, 52(sp) +; RV32I-NEXT: slli s0, s0, 24 +; RV32I-NEXT: or t4, s0, t4 +; RV32I-NEXT: addi s0, sp, 40 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s5, s5, 8 +; RV32I-NEXT: slli s6, s6, 16 +; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli s8, s8, 8 +; RV32I-NEXT: slli s9, s9, 16 +; RV32I-NEXT: slli s10, s10, 24 +; RV32I-NEXT: or t1, t2, t1 ; RV32I-NEXT: srli a1, a0, 3 -; RV32I-NEXT: andi a3, a0, 31 -; RV32I-NEXT: andi a4, a1, 28 -; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: sub a3, s3, a4 -; RV32I-NEXT: lw a4, 0(a3) -; RV32I-NEXT: lw a5, 4(a3) -; RV32I-NEXT: lw a6, 8(a3) -; RV32I-NEXT: lw a7, 12(a3) -; RV32I-NEXT: lw t0, 16(a3) -; RV32I-NEXT: lw t1, 20(a3) -; RV32I-NEXT: lw t2, 24(a3) -; RV32I-NEXT: lw a3, 28(a3) -; RV32I-NEXT: sll t3, a5, a0 -; RV32I-NEXT: srli t4, a4, 1 -; RV32I-NEXT: sll t5, a7, a0 -; RV32I-NEXT: srli t6, a6, 1 -; RV32I-NEXT: sll a6, a6, a0 -; RV32I-NEXT: srli a5, a5, 1 -; RV32I-NEXT: sll s0, t1, a0 -; RV32I-NEXT: srli s1, t0, 1 -; RV32I-NEXT: sll t0, t0, a0 -; RV32I-NEXT: srli a7, a7, 1 -; RV32I-NEXT: sll s2, a3, a0 -; RV32I-NEXT: srli a3, t2, 1 +; RV32I-NEXT: or t2, s1, t5 +; RV32I-NEXT: andi t5, a0, 31 +; RV32I-NEXT: or t3, s5, t3 +; RV32I-NEXT: or t6, t6, s6 +; RV32I-NEXT: or s1, s8, s4 +; RV32I-NEXT: or s4, s10, s9 +; RV32I-NEXT: andi s5, a1, 28 +; RV32I-NEXT: xori a1, t5, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, s2, s7 +; RV32I-NEXT: or a7, t4, s3 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t1, t6, t3 +; RV32I-NEXT: or t2, s4, s1 +; RV32I-NEXT: sub t3, s0, s5 +; RV32I-NEXT: sw a7, 56(sp) +; RV32I-NEXT: sw t0, 60(sp) +; RV32I-NEXT: sw t1, 64(sp) +; RV32I-NEXT: sw t2, 68(sp) +; RV32I-NEXT: sw a3, 40(sp) +; RV32I-NEXT: sw a4, 44(sp) +; RV32I-NEXT: sw a5, 48(sp) +; RV32I-NEXT: sw a6, 52(sp) +; 
RV32I-NEXT: lw a3, 0(t3) +; RV32I-NEXT: lw a4, 4(t3) +; RV32I-NEXT: lw a5, 8(t3) +; RV32I-NEXT: lw a6, 12(t3) +; RV32I-NEXT: lw a7, 16(t3) +; RV32I-NEXT: lw t0, 20(t3) +; RV32I-NEXT: lw t1, 24(t3) +; RV32I-NEXT: lw t2, 28(t3) +; RV32I-NEXT: sll t3, a4, a0 +; RV32I-NEXT: srli t4, a3, 1 +; RV32I-NEXT: sll t5, a6, a0 +; RV32I-NEXT: srli t6, a5, 1 +; RV32I-NEXT: sll a5, a5, a0 +; RV32I-NEXT: srli a4, a4, 1 +; RV32I-NEXT: sll s0, t0, a0 +; RV32I-NEXT: srli s1, a7, 1 +; RV32I-NEXT: sll a7, a7, a0 +; RV32I-NEXT: srli a6, a6, 1 ; RV32I-NEXT: sll t2, t2, a0 -; RV32I-NEXT: srli t1, t1, 1 -; RV32I-NEXT: sll s3, a4, a0 +; RV32I-NEXT: srli s2, t1, 1 +; RV32I-NEXT: sll t1, t1, a0 +; RV32I-NEXT: srli t0, t0, 1 +; RV32I-NEXT: sll s3, a3, a0 ; RV32I-NEXT: srl a0, t4, a1 -; RV32I-NEXT: srl a4, t6, a1 -; RV32I-NEXT: srl a5, a5, a1 +; RV32I-NEXT: srl a3, t6, a1 +; RV32I-NEXT: srl a4, a4, a1 ; RV32I-NEXT: srl t4, s1, a1 -; RV32I-NEXT: srl a7, a7, a1 -; RV32I-NEXT: srl t6, a3, a1 -; RV32I-NEXT: srl t1, t1, a1 +; RV32I-NEXT: srl a6, a6, a1 +; RV32I-NEXT: srl t6, s2, a1 +; RV32I-NEXT: srl t0, t0, a1 ; RV32I-NEXT: srli s1, s3, 24 -; RV32I-NEXT: srli s4, s3, 16 -; RV32I-NEXT: srli s5, s3, 8 +; RV32I-NEXT: srli s2, s3, 16 +; RV32I-NEXT: srli s4, s3, 8 ; RV32I-NEXT: or a0, t3, a0 -; RV32I-NEXT: or a1, t5, a4 -; RV32I-NEXT: or a3, a6, a5 +; RV32I-NEXT: or a1, t5, a3 +; RV32I-NEXT: or a3, a5, a4 ; RV32I-NEXT: or a4, s0, t4 -; RV32I-NEXT: or a5, t0, a7 -; RV32I-NEXT: or a6, s2, t6 -; RV32I-NEXT: or a7, t2, t1 +; RV32I-NEXT: or a5, a7, a6 +; RV32I-NEXT: or a6, t2, t6 +; RV32I-NEXT: or a7, t1, t0 ; RV32I-NEXT: sb s3, 0(a2) -; RV32I-NEXT: sb s5, 1(a2) -; RV32I-NEXT: sb s4, 2(a2) +; RV32I-NEXT: sb s4, 1(a2) +; RV32I-NEXT: sb s2, 2(a2) ; RV32I-NEXT: sb s1, 3(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 @@ -2251,19 +2152,17 @@ define void @shl_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-LABEL: ashr_32bytes: ; RV64I: # %bb.0: -; RV64I-NEXT: addi sp, sp, -160 -; RV64I-NEXT: sd s0, 152(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s1, 144(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s2, 136(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s3, 128(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s4, 120(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s5, 112(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s6, 104(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s7, 96(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s8, 88(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s9, 80(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s10, 72(sp) # 8-byte Folded Spill -; RV64I-NEXT: sd s11, 64(sp) # 8-byte Folded Spill +; RV64I-NEXT: addi sp, sp, -144 +; RV64I-NEXT: sd s0, 136(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s1, 128(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s2, 120(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s3, 112(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s4, 104(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s5, 96(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s6, 88(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s7, 80(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s8, 72(sp) # 8-byte Folded Spill +; RV64I-NEXT: sd s9, 64(sp) # 8-byte Folded Spill ; RV64I-NEXT: lbu a3, 0(a0) ; RV64I-NEXT: lbu a4, 1(a0) ; RV64I-NEXT: lbu a5, 2(a0) @@ -2280,144 +2179,123 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: lbu s1, 13(a0) ; RV64I-NEXT: lbu s2, 14(a0) ; RV64I-NEXT: lbu s3, 15(a0) +; RV64I-NEXT: slli a4, a4, 8 +; 
RV64I-NEXT: slli a5, a5, 16 +; RV64I-NEXT: slli a6, a6, 24 +; RV64I-NEXT: or a3, a4, a3 +; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: lbu s4, 16(a0) ; RV64I-NEXT: lbu s5, 17(a0) ; RV64I-NEXT: lbu s6, 18(a0) ; RV64I-NEXT: lbu s7, 19(a0) -; RV64I-NEXT: slli a4, a4, 8 -; RV64I-NEXT: slli a5, a5, 16 -; RV64I-NEXT: slli a6, a6, 24 ; RV64I-NEXT: slli t0, t0, 8 ; RV64I-NEXT: slli t1, t1, 16 ; RV64I-NEXT: slli t2, t2, 24 -; RV64I-NEXT: or a3, a4, a3 -; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a5, t0, a7 -; RV64I-NEXT: or a6, t2, t1 -; RV64I-NEXT: lbu s8, 20(a0) -; RV64I-NEXT: lbu s9, 21(a0) -; RV64I-NEXT: lbu s10, 22(a0) -; RV64I-NEXT: lbu s11, 23(a0) ; RV64I-NEXT: slli t4, t4, 8 ; RV64I-NEXT: slli t5, t5, 16 ; RV64I-NEXT: slli t6, t6, 24 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a6, t2, t1 +; RV64I-NEXT: or a7, t4, t3 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: lbu t5, 20(a0) +; RV64I-NEXT: lbu t6, 21(a0) +; RV64I-NEXT: lbu s8, 22(a0) +; RV64I-NEXT: lbu s9, 23(a0) ; RV64I-NEXT: slli s1, s1, 8 ; RV64I-NEXT: slli s2, s2, 16 ; RV64I-NEXT: slli s3, s3, 24 -; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or t0, t6, t5 -; RV64I-NEXT: or t1, s1, s0 -; RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: lbu t6, 24(a0) -; RV64I-NEXT: lbu s0, 25(a0) -; RV64I-NEXT: lbu s1, 26(a0) -; RV64I-NEXT: lbu s2, 27(a0) ; RV64I-NEXT: slli s5, s5, 8 ; RV64I-NEXT: slli s6, s6, 16 ; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: slli s9, s9, 8 +; RV64I-NEXT: or t1, s1, s0 +; RV64I-NEXT: or t2, s3, s2 ; RV64I-NEXT: or t3, s5, s4 ; RV64I-NEXT: or t4, s7, s6 -; RV64I-NEXT: or t5, s9, s8 -; RV64I-NEXT: lbu s3, 28(a0) +; RV64I-NEXT: lbu s0, 24(a0) +; RV64I-NEXT: lbu s1, 25(a0) +; RV64I-NEXT: lbu s2, 26(a0) +; RV64I-NEXT: lbu s3, 27(a0) +; RV64I-NEXT: slli t6, t6, 8 +; RV64I-NEXT: slli s8, s8, 16 +; RV64I-NEXT: slli s9, s9, 24 +; RV64I-NEXT: slli s1, s1, 8 +; RV64I-NEXT: or t5, t6, t5 +; RV64I-NEXT: or t6, s9, s8 +; RV64I-NEXT: or s0, s1, s0 +; RV64I-NEXT: lbu s1, 28(a0) ; RV64I-NEXT: lbu s4, 29(a0) ; RV64I-NEXT: lbu s5, 30(a0) ; RV64I-NEXT: lbu s6, 31(a0) -; RV64I-NEXT: slli s10, s10, 16 -; RV64I-NEXT: slli s11, s11, 24 -; RV64I-NEXT: slli s0, s0, 8 -; RV64I-NEXT: slli s1, s1, 16 -; RV64I-NEXT: slli s2, s2, 24 +; RV64I-NEXT: lbu a0, 0(a1) +; RV64I-NEXT: slli s2, s2, 16 +; RV64I-NEXT: slli s3, s3, 24 +; RV64I-NEXT: or a1, s3, s2 +; RV64I-NEXT: mv s2, sp ; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or a0, s11, s10 -; RV64I-NEXT: or t6, s0, t6 -; RV64I-NEXT: or s0, s2, s1 -; RV64I-NEXT: or s1, s4, s3 -; RV64I-NEXT: lbu s2, 0(a1) -; RV64I-NEXT: lbu s3, 1(a1) -; RV64I-NEXT: lbu s4, 2(a1) -; RV64I-NEXT: lbu s7, 3(a1) ; RV64I-NEXT: slli s5, s5, 16 ; RV64I-NEXT: slli s6, s6, 24 -; RV64I-NEXT: slli s3, s3, 8 -; RV64I-NEXT: slli s4, s4, 16 -; RV64I-NEXT: slli s7, s7, 24 -; RV64I-NEXT: or s5, s6, s5 -; RV64I-NEXT: or s2, s3, s2 -; RV64I-NEXT: or s3, s7, s4 -; RV64I-NEXT: lbu s4, 5(a1) -; RV64I-NEXT: lbu s6, 4(a1) -; RV64I-NEXT: lbu s7, 6(a1) -; RV64I-NEXT: lbu a1, 7(a1) -; RV64I-NEXT: slli s4, s4, 8 -; RV64I-NEXT: or s4, s4, s6 -; RV64I-NEXT: slli s7, s7, 16 -; RV64I-NEXT: slli a1, a1, 24 -; RV64I-NEXT: or a1, a1, s7 -; RV64I-NEXT: mv s6, sp +; RV64I-NEXT: or s1, s4, s1 +; RV64I-NEXT: srli s3, a0, 3 +; RV64I-NEXT: or s4, s6, s5 +; RV64I-NEXT: andi s5, a0, 63 +; RV64I-NEXT: andi s3, s3, 24 +; RV64I-NEXT: xori s5, s5, 63 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 ; RV64I-NEXT: or a5, t0, a7 ; RV64I-NEXT: or a6, t2, t1 ; RV64I-NEXT: or a7, t4, t3 -; RV64I-NEXT: or a0, a0, t5 -; RV64I-NEXT: or t0, s0, t6 -; RV64I-NEXT: or t1, s5, s1 -; 
RV64I-NEXT: or t2, s3, s2 -; RV64I-NEXT: or a1, a1, s4 +; RV64I-NEXT: or t0, t6, t5 +; RV64I-NEXT: or a1, a1, s0 +; RV64I-NEXT: or t1, s4, s1 +; RV64I-NEXT: add s2, s2, s3 ; RV64I-NEXT: slli a4, a4, 32 ; RV64I-NEXT: slli a6, a6, 32 -; RV64I-NEXT: slli a0, a0, 32 -; RV64I-NEXT: slli t3, t1, 32 -; RV64I-NEXT: slli a1, a1, 32 +; RV64I-NEXT: slli t0, t0, 32 +; RV64I-NEXT: slli t2, t1, 32 ; RV64I-NEXT: sraiw t1, t1, 31 ; RV64I-NEXT: or a3, a4, a3 ; RV64I-NEXT: or a4, a6, a5 -; RV64I-NEXT: or a0, a0, a7 -; RV64I-NEXT: or a5, t3, t0 -; RV64I-NEXT: or a1, a1, t2 +; RV64I-NEXT: or a5, t0, a7 +; RV64I-NEXT: or a1, t2, a1 ; RV64I-NEXT: sd t1, 32(sp) ; RV64I-NEXT: sd t1, 40(sp) ; RV64I-NEXT: sd t1, 48(sp) ; RV64I-NEXT: sd t1, 56(sp) ; RV64I-NEXT: sd a3, 0(sp) ; RV64I-NEXT: sd a4, 8(sp) -; RV64I-NEXT: sd a0, 16(sp) -; RV64I-NEXT: sd a5, 24(sp) -; RV64I-NEXT: srli a0, a1, 3 -; RV64I-NEXT: andi a3, a1, 63 -; RV64I-NEXT: andi a0, a0, 24 -; RV64I-NEXT: xori a3, a3, 63 -; RV64I-NEXT: add a0, s6, a0 -; RV64I-NEXT: ld a4, 8(a0) -; RV64I-NEXT: ld a5, 16(a0) -; RV64I-NEXT: ld a6, 0(a0) -; RV64I-NEXT: ld a0, 24(a0) -; RV64I-NEXT: srl a7, a4, a1 +; RV64I-NEXT: sd a5, 16(sp) +; RV64I-NEXT: sd a1, 24(sp) +; RV64I-NEXT: ld a1, 8(s2) +; RV64I-NEXT: ld a3, 16(s2) +; RV64I-NEXT: ld a4, 0(s2) +; RV64I-NEXT: ld a5, 24(s2) +; RV64I-NEXT: srl a6, a1, a0 +; RV64I-NEXT: slli a7, a3, 1 +; RV64I-NEXT: srl a4, a4, a0 +; RV64I-NEXT: slli a1, a1, 1 +; RV64I-NEXT: srl a3, a3, a0 ; RV64I-NEXT: slli t0, a5, 1 -; RV64I-NEXT: srl a6, a6, a1 -; RV64I-NEXT: slli a4, a4, 1 -; RV64I-NEXT: srl a5, a5, a1 -; RV64I-NEXT: slli t1, a0, 1 -; RV64I-NEXT: sra t2, a0, a1 -; RV64I-NEXT: sll a0, t0, a3 -; RV64I-NEXT: sll a1, a4, a3 -; RV64I-NEXT: sll a3, t1, a3 -; RV64I-NEXT: srli a4, t2, 56 -; RV64I-NEXT: srli t0, t2, 48 -; RV64I-NEXT: srli t1, t2, 40 -; RV64I-NEXT: srli t3, t2, 32 -; RV64I-NEXT: srli t4, t2, 24 -; RV64I-NEXT: srli t5, t2, 16 -; RV64I-NEXT: srli t6, t2, 8 -; RV64I-NEXT: or a0, a7, a0 -; RV64I-NEXT: or a1, a6, a1 -; RV64I-NEXT: or a3, a5, a3 +; RV64I-NEXT: sra a5, a5, a0 +; RV64I-NEXT: sll a0, a7, s5 +; RV64I-NEXT: sll a1, a1, s5 +; RV64I-NEXT: sll a7, t0, s5 +; RV64I-NEXT: srli t0, a5, 56 +; RV64I-NEXT: srli t1, a5, 48 +; RV64I-NEXT: srli t2, a5, 40 +; RV64I-NEXT: srli t3, a5, 32 +; RV64I-NEXT: srli t4, a5, 24 +; RV64I-NEXT: srli t5, a5, 16 +; RV64I-NEXT: srli t6, a5, 8 +; RV64I-NEXT: or a0, a6, a0 +; RV64I-NEXT: or a1, a4, a1 +; RV64I-NEXT: or a3, a3, a7 ; RV64I-NEXT: sb t3, 28(a2) -; RV64I-NEXT: sb t1, 29(a2) -; RV64I-NEXT: sb t0, 30(a2) -; RV64I-NEXT: sb a4, 31(a2) -; RV64I-NEXT: sb t2, 24(a2) +; RV64I-NEXT: sb t2, 29(a2) +; RV64I-NEXT: sb t1, 30(a2) +; RV64I-NEXT: sb t0, 31(a2) +; RV64I-NEXT: sb a5, 24(a2) ; RV64I-NEXT: sb t6, 25(a2) ; RV64I-NEXT: sb t5, 26(a2) ; RV64I-NEXT: sb t4, 27(a2) @@ -2438,47 +2316,45 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV64I-NEXT: srli s3, a0, 56 ; RV64I-NEXT: srli s4, a0, 48 ; RV64I-NEXT: srli s5, a0, 40 -; RV64I-NEXT: srli s6, a0, 32 ; RV64I-NEXT: sb a7, 20(a2) ; RV64I-NEXT: sb a6, 21(a2) ; RV64I-NEXT: sb a5, 22(a2) ; RV64I-NEXT: sb a4, 23(a2) -; RV64I-NEXT: srli a4, a0, 24 +; RV64I-NEXT: srli a4, a0, 32 ; RV64I-NEXT: sb a3, 16(a2) ; RV64I-NEXT: sb t2, 17(a2) ; RV64I-NEXT: sb t1, 18(a2) ; RV64I-NEXT: sb t0, 19(a2) -; RV64I-NEXT: srli a3, a0, 16 +; RV64I-NEXT: srli a3, a0, 24 ; RV64I-NEXT: sb t6, 4(a2) ; RV64I-NEXT: sb t5, 5(a2) ; RV64I-NEXT: sb t4, 6(a2) ; RV64I-NEXT: sb t3, 7(a2) -; RV64I-NEXT: srli a5, a0, 8 +; RV64I-NEXT: srli a5, a0, 16 ; RV64I-NEXT: 
sb a1, 0(a2) ; RV64I-NEXT: sb s2, 1(a2) ; RV64I-NEXT: sb s1, 2(a2) ; RV64I-NEXT: sb s0, 3(a2) -; RV64I-NEXT: sb s6, 12(a2) +; RV64I-NEXT: srli a1, a0, 8 +; RV64I-NEXT: sb a4, 12(a2) ; RV64I-NEXT: sb s5, 13(a2) ; RV64I-NEXT: sb s4, 14(a2) ; RV64I-NEXT: sb s3, 15(a2) ; RV64I-NEXT: sb a0, 8(a2) -; RV64I-NEXT: sb a5, 9(a2) -; RV64I-NEXT: sb a3, 10(a2) -; RV64I-NEXT: sb a4, 11(a2) -; RV64I-NEXT: ld s0, 152(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s1, 144(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s2, 136(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s3, 128(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s4, 120(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s5, 112(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s6, 104(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s7, 96(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s8, 88(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s9, 80(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s10, 72(sp) # 8-byte Folded Reload -; RV64I-NEXT: ld s11, 64(sp) # 8-byte Folded Reload -; RV64I-NEXT: addi sp, sp, 160 +; RV64I-NEXT: sb a1, 9(a2) +; RV64I-NEXT: sb a5, 10(a2) +; RV64I-NEXT: sb a3, 11(a2) +; RV64I-NEXT: ld s0, 136(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s1, 128(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s2, 120(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s3, 112(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s4, 104(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s5, 96(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s6, 88(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s7, 80(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s8, 72(sp) # 8-byte Folded Reload +; RV64I-NEXT: ld s9, 64(sp) # 8-byte Folded Reload +; RV64I-NEXT: addi sp, sp, 144 ; RV64I-NEXT: ret ; ; RV32I-LABEL: ashr_32bytes: @@ -2503,159 +2379,148 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; RV32I-NEXT: lbu a7, 3(a0) ; RV32I-NEXT: lbu a5, 4(a0) ; RV32I-NEXT: lbu t0, 5(a0) -; RV32I-NEXT: lbu t1, 6(a0) -; RV32I-NEXT: lbu t2, 7(a0) -; RV32I-NEXT: lbu t3, 8(a0) -; RV32I-NEXT: lbu t4, 9(a0) -; RV32I-NEXT: lbu t5, 10(a0) -; RV32I-NEXT: lbu t6, 11(a0) -; RV32I-NEXT: lbu s0, 12(a0) -; RV32I-NEXT: lbu s1, 13(a0) -; RV32I-NEXT: lbu s2, 14(a0) -; RV32I-NEXT: lbu s3, 15(a0) -; RV32I-NEXT: lbu s4, 16(a0) -; RV32I-NEXT: lbu s5, 17(a0) -; RV32I-NEXT: lbu s6, 18(a0) -; RV32I-NEXT: lbu s7, 19(a0) +; RV32I-NEXT: lbu t3, 6(a0) +; RV32I-NEXT: lbu t4, 7(a0) +; RV32I-NEXT: lbu t6, 8(a0) +; RV32I-NEXT: lbu s0, 9(a0) +; RV32I-NEXT: lbu s4, 10(a0) +; RV32I-NEXT: lbu s5, 11(a0) +; RV32I-NEXT: lbu s6, 12(a0) +; RV32I-NEXT: lbu s7, 13(a0) +; RV32I-NEXT: lbu s8, 14(a0) +; RV32I-NEXT: lbu s9, 15(a0) +; RV32I-NEXT: lbu s10, 16(a0) +; RV32I-NEXT: lbu s11, 17(a0) +; RV32I-NEXT: lbu s2, 18(a0) +; RV32I-NEXT: lbu s3, 19(a0) ; RV32I-NEXT: slli a4, a4, 8 ; RV32I-NEXT: slli a6, a6, 16 ; RV32I-NEXT: slli a7, a7, 24 ; RV32I-NEXT: or a3, a4, a3 -; RV32I-NEXT: sw a3, 4(sp) # 4-byte Folded Spill ; RV32I-NEXT: or a4, a7, a6 -; RV32I-NEXT: lbu s8, 20(a0) -; RV32I-NEXT: lbu s9, 21(a0) -; RV32I-NEXT: lbu s10, 22(a0) -; RV32I-NEXT: lbu s11, 23(a0) +; RV32I-NEXT: lbu t1, 20(a0) +; RV32I-NEXT: lbu t2, 21(a0) +; RV32I-NEXT: lbu t5, 22(a0) +; RV32I-NEXT: lbu s1, 23(a0) ; RV32I-NEXT: slli t0, t0, 8 -; RV32I-NEXT: slli t1, t1, 16 -; RV32I-NEXT: slli t2, t2, 24 -; RV32I-NEXT: slli t4, t4, 8 -; RV32I-NEXT: slli t5, t5, 16 -; RV32I-NEXT: slli t6, t6, 24 +; RV32I-NEXT: slli t3, t3, 16 +; RV32I-NEXT: slli t4, t4, 24 +; RV32I-NEXT: slli s0, s0, 8 +; RV32I-NEXT: slli s4, s4, 16 +; RV32I-NEXT: slli s5, s5, 24 ; RV32I-NEXT: or a5, 
t0, a5 -; RV32I-NEXT: or a6, t2, t1 -; RV32I-NEXT: or a7, t4, t3 -; RV32I-NEXT: or t0, t6, t5 -; RV32I-NEXT: lbu ra, 24(a0) -; RV32I-NEXT: lbu a3, 25(a0) -; RV32I-NEXT: lbu t4, 26(a0) -; RV32I-NEXT: lbu t5, 27(a0) -; RV32I-NEXT: slli s1, s1, 8 +; RV32I-NEXT: or a6, t4, t3 +; RV32I-NEXT: or a7, s0, t6 +; RV32I-NEXT: or t0, s5, s4 +; RV32I-NEXT: lbu t3, 24(a0) +; RV32I-NEXT: lbu s4, 25(a0) +; RV32I-NEXT: lbu s5, 26(a0) +; RV32I-NEXT: lbu ra, 27(a0) +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: slli s11, s11, 8 +; RV32I-NEXT: or t4, s7, s6 +; RV32I-NEXT: or t6, s9, s8 +; RV32I-NEXT: or s0, s11, s10 +; RV32I-NEXT: lbu s6, 28(a0) +; RV32I-NEXT: lbu s7, 29(a0) +; RV32I-NEXT: lbu s8, 30(a0) +; RV32I-NEXT: lbu s9, 31(a0) +; RV32I-NEXT: lbu a0, 0(a1) ; RV32I-NEXT: slli s2, s2, 16 ; RV32I-NEXT: slli s3, s3, 24 -; RV32I-NEXT: slli s5, s5, 8 -; RV32I-NEXT: or t1, s1, s0 -; RV32I-NEXT: or t2, s3, s2 -; RV32I-NEXT: or t3, s5, s4 -; RV32I-NEXT: lbu t6, 28(a0) -; RV32I-NEXT: lbu s0, 29(a0) -; RV32I-NEXT: lbu s1, 30(a0) -; RV32I-NEXT: lbu a0, 31(a0) -; RV32I-NEXT: slli s6, s6, 16 -; RV32I-NEXT: slli s7, s7, 24 -; RV32I-NEXT: slli s9, s9, 8 -; RV32I-NEXT: slli s10, s10, 16 -; RV32I-NEXT: slli s11, s11, 24 -; RV32I-NEXT: or s2, s7, s6 -; RV32I-NEXT: or s3, s9, s8 -; RV32I-NEXT: or s4, s11, s10 -; RV32I-NEXT: lbu s5, 0(a1) -; RV32I-NEXT: lbu s6, 1(a1) -; RV32I-NEXT: lbu s7, 2(a1) -; RV32I-NEXT: lbu a1, 3(a1) -; RV32I-NEXT: slli a3, a3, 8 -; RV32I-NEXT: or a3, a3, ra -; RV32I-NEXT: addi s8, sp, 8 -; RV32I-NEXT: slli t4, t4, 16 -; RV32I-NEXT: slli t5, t5, 24 -; RV32I-NEXT: slli s0, s0, 8 -; RV32I-NEXT: slli s1, s1, 16 -; RV32I-NEXT: slli a0, a0, 24 -; RV32I-NEXT: slli s6, s6, 8 -; RV32I-NEXT: slli s7, s7, 16 -; RV32I-NEXT: slli a1, a1, 24 -; RV32I-NEXT: or t4, t5, t4 -; RV32I-NEXT: or t5, s0, t6 -; RV32I-NEXT: or s1, a0, s1 -; RV32I-NEXT: or t6, s6, s5 -; RV32I-NEXT: or a1, a1, s7 -; RV32I-NEXT: srai s0, a0, 31 -; RV32I-NEXT: lw a0, 4(sp) # 4-byte Folded Reload -; RV32I-NEXT: or a4, a4, a0 -; RV32I-NEXT: or a5, a6, a5 -; RV32I-NEXT: or a6, t0, a7 -; RV32I-NEXT: or a7, t2, t1 -; RV32I-NEXT: or t0, s2, t3 -; RV32I-NEXT: or t1, s4, s3 -; RV32I-NEXT: or a3, t4, a3 +; RV32I-NEXT: or s2, s3, s2 +; RV32I-NEXT: addi s3, sp, 8 +; RV32I-NEXT: slli t2, t2, 8 +; RV32I-NEXT: slli t5, t5, 16 +; RV32I-NEXT: slli s1, s1, 24 +; RV32I-NEXT: slli s4, s4, 8 +; RV32I-NEXT: slli s5, s5, 16 +; RV32I-NEXT: slli ra, ra, 24 +; RV32I-NEXT: slli s7, s7, 8 +; RV32I-NEXT: slli s8, s8, 16 +; RV32I-NEXT: slli s9, s9, 24 +; RV32I-NEXT: or t1, t2, t1 +; RV32I-NEXT: srli a1, a0, 3 ; RV32I-NEXT: or t2, s1, t5 -; RV32I-NEXT: or a0, a1, t6 -; RV32I-NEXT: sw s0, 56(sp) -; RV32I-NEXT: sw s0, 60(sp) -; RV32I-NEXT: sw s0, 64(sp) -; RV32I-NEXT: sw s0, 68(sp) -; RV32I-NEXT: sw s0, 40(sp) -; RV32I-NEXT: sw s0, 44(sp) -; RV32I-NEXT: sw s0, 48(sp) -; RV32I-NEXT: sw s0, 52(sp) -; RV32I-NEXT: sw t0, 24(sp) -; RV32I-NEXT: sw t1, 28(sp) -; RV32I-NEXT: sw a3, 32(sp) +; RV32I-NEXT: andi t5, a0, 31 +; RV32I-NEXT: or t3, s4, t3 +; RV32I-NEXT: or s1, ra, s5 +; RV32I-NEXT: or s4, s7, s6 +; RV32I-NEXT: or s5, s9, s8 +; RV32I-NEXT: srai s6, s9, 31 +; RV32I-NEXT: andi s7, a1, 28 +; RV32I-NEXT: xori a1, t5, 31 +; RV32I-NEXT: or a3, a4, a3 +; RV32I-NEXT: or a4, a6, a5 +; RV32I-NEXT: or a5, t0, a7 +; RV32I-NEXT: or a6, t6, t4 +; RV32I-NEXT: or a7, s2, s0 +; RV32I-NEXT: or t0, t2, t1 +; RV32I-NEXT: or t1, s1, t3 +; RV32I-NEXT: or t2, s5, s4 +; RV32I-NEXT: sw s6, 56(sp) +; RV32I-NEXT: sw s6, 60(sp) +; RV32I-NEXT: 
sw s6, 64(sp) +; RV32I-NEXT: sw s6, 68(sp) +; RV32I-NEXT: sw s6, 40(sp) +; RV32I-NEXT: sw s6, 44(sp) +; RV32I-NEXT: sw s6, 48(sp) +; RV32I-NEXT: sw s6, 52(sp) +; RV32I-NEXT: add s3, s3, s7 +; RV32I-NEXT: sw a7, 24(sp) +; RV32I-NEXT: sw t0, 28(sp) +; RV32I-NEXT: sw t1, 32(sp) ; RV32I-NEXT: sw t2, 36(sp) -; RV32I-NEXT: sw a4, 8(sp) -; RV32I-NEXT: sw a5, 12(sp) -; RV32I-NEXT: sw a6, 16(sp) -; RV32I-NEXT: sw a7, 20(sp) -; RV32I-NEXT: srli a1, a0, 3 -; RV32I-NEXT: andi a3, a0, 31 -; RV32I-NEXT: andi a4, a1, 28 -; RV32I-NEXT: xori a1, a3, 31 -; RV32I-NEXT: add a4, s8, a4 -; RV32I-NEXT: lw a3, 0(a4) -; RV32I-NEXT: lw a5, 4(a4) -; RV32I-NEXT: lw a6, 8(a4) -; RV32I-NEXT: lw a7, 12(a4) -; RV32I-NEXT: lw t0, 16(a4) -; RV32I-NEXT: lw t1, 20(a4) -; RV32I-NEXT: lw t2, 24(a4) -; RV32I-NEXT: lw a4, 28(a4) -; RV32I-NEXT: srl t3, a5, a0 -; RV32I-NEXT: slli t4, a6, 1 +; RV32I-NEXT: sw a3, 8(sp) +; RV32I-NEXT: sw a4, 12(sp) +; RV32I-NEXT: sw a5, 16(sp) +; RV32I-NEXT: sw a6, 20(sp) +; RV32I-NEXT: lw a3, 0(s3) +; RV32I-NEXT: lw a4, 4(s3) +; RV32I-NEXT: lw a5, 8(s3) +; RV32I-NEXT: lw a6, 12(s3) +; RV32I-NEXT: lw a7, 16(s3) +; RV32I-NEXT: lw t0, 20(s3) +; RV32I-NEXT: lw t1, 24(s3) +; RV32I-NEXT: lw t2, 28(s3) +; RV32I-NEXT: srl t3, a4, a0 +; RV32I-NEXT: slli t4, a5, 1 ; RV32I-NEXT: srl a3, a3, a0 -; RV32I-NEXT: slli a5, a5, 1 -; RV32I-NEXT: srl t5, a7, a0 -; RV32I-NEXT: slli t6, t0, 1 -; RV32I-NEXT: srl a6, a6, a0 -; RV32I-NEXT: slli a7, a7, 1 -; RV32I-NEXT: srl s0, t1, a0 -; RV32I-NEXT: slli s1, t2, 1 -; RV32I-NEXT: srl t0, t0, a0 -; RV32I-NEXT: slli t1, t1, 1 -; RV32I-NEXT: srl t2, t2, a0 -; RV32I-NEXT: slli s2, a4, 1 -; RV32I-NEXT: sra s3, a4, a0 +; RV32I-NEXT: slli a4, a4, 1 +; RV32I-NEXT: srl t5, a6, a0 +; RV32I-NEXT: slli t6, a7, 1 +; RV32I-NEXT: srl a5, a5, a0 +; RV32I-NEXT: slli a6, a6, 1 +; RV32I-NEXT: srl s0, t0, a0 +; RV32I-NEXT: slli s1, t1, 1 +; RV32I-NEXT: srl a7, a7, a0 +; RV32I-NEXT: slli t0, t0, 1 +; RV32I-NEXT: srl t1, t1, a0 +; RV32I-NEXT: slli s2, t2, 1 +; RV32I-NEXT: sra t2, t2, a0 ; RV32I-NEXT: sll a0, t4, a1 -; RV32I-NEXT: sll a4, a5, a1 -; RV32I-NEXT: sll a5, t6, a1 -; RV32I-NEXT: sll a7, a7, a1 -; RV32I-NEXT: sll t4, s1, a1 -; RV32I-NEXT: sll t1, t1, a1 -; RV32I-NEXT: sll t6, s2, a1 -; RV32I-NEXT: srli s1, s3, 24 -; RV32I-NEXT: srli s2, s3, 16 -; RV32I-NEXT: srli s4, s3, 8 +; RV32I-NEXT: sll a4, a4, a1 +; RV32I-NEXT: sll t4, t6, a1 +; RV32I-NEXT: sll a6, a6, a1 +; RV32I-NEXT: sll t6, s1, a1 +; RV32I-NEXT: sll t0, t0, a1 +; RV32I-NEXT: sll s1, s2, a1 +; RV32I-NEXT: srli s2, t2, 24 +; RV32I-NEXT: srli s3, t2, 16 +; RV32I-NEXT: srli s4, t2, 8 ; RV32I-NEXT: or a0, t3, a0 ; RV32I-NEXT: or a1, a3, a4 -; RV32I-NEXT: or a3, t5, a5 -; RV32I-NEXT: or a4, a6, a7 -; RV32I-NEXT: or a5, s0, t4 -; RV32I-NEXT: or a6, t0, t1 -; RV32I-NEXT: or a7, t2, t6 -; RV32I-NEXT: sb s3, 28(a2) +; RV32I-NEXT: or a3, t5, t4 +; RV32I-NEXT: or a4, a5, a6 +; RV32I-NEXT: or a5, s0, t6 +; RV32I-NEXT: or a6, a7, t0 +; RV32I-NEXT: or a7, t1, s1 +; RV32I-NEXT: sb t2, 28(a2) ; RV32I-NEXT: sb s4, 29(a2) -; RV32I-NEXT: sb s2, 30(a2) -; RV32I-NEXT: sb s1, 31(a2) +; RV32I-NEXT: sb s3, 30(a2) +; RV32I-NEXT: sb s2, 31(a2) ; RV32I-NEXT: srli t0, a7, 24 ; RV32I-NEXT: srli t1, a7, 16 ; RV32I-NEXT: srli t2, a7, 8 diff --git a/llvm/test/CodeGen/SystemZ/pr60413.ll b/llvm/test/CodeGen/SystemZ/pr60413.ll index 6dee3d303a6f2..bbf4d50bd716d 100644 --- a/llvm/test/CodeGen/SystemZ/pr60413.ll +++ b/llvm/test/CodeGen/SystemZ/pr60413.ll @@ -13,73 +13,67 @@ declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #0 define dso_local void @m() 
local_unnamed_addr #1 { ; CHECK-LABEL: m: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: stmg %r13, %r15, 104(%r15) ; CHECK-NEXT: aghi %r15, -168 ; CHECK-NEXT: lhrl %r1, f+4 -; CHECK-NEXT: sll %r1, 8 ; CHECK-NEXT: larl %r2, f -; CHECK-NEXT: ic %r1, 6(%r2) -; CHECK-NEXT: larl %r2, e -; CHECK-NEXT: lb %r0, 3(%r2) -; CHECK-NEXT: vlvgf %v1, %r1, 0 -; CHECK-NEXT: vlvgf %v1, %r1, 1 -; CHECK-NEXT: larl %r2, .LCPI0_0 -; CHECK-NEXT: vl %v2, 0(%r2), 3 -; CHECK-NEXT: vlvgf %v1, %r1, 3 -; CHECK-NEXT: vlvgf %v3, %r1, 3 -; CHECK-NEXT: vlvgf %v0, %r1, 1 -; CHECK-NEXT: vperm %v4, %v1, %v0, %v2 -; CHECK-NEXT: vlvgf %v0, %r1, 3 +; CHECK-NEXT: llc %r2, 6(%r2) +; CHECK-NEXT: larl %r3, e +; CHECK-NEXT: lb %r0, 3(%r3) +; CHECK-NEXT: rosbg %r2, %r1, 32, 55, 8 +; CHECK-NEXT: vlvgp %v0, %r2, %r0 +; CHECK-NEXT: vlvgf %v0, %r2, 0 +; CHECK-NEXT: vlvgf %v0, %r2, 2 +; CHECK-NEXT: vlvgp %v1, %r0, %r2 +; CHECK-NEXT: vlvgp %v2, %r2, %r2 +; CHECK-NEXT: lr %r1, %r2 ; CHECK-NEXT: nilh %r1, 255 ; CHECK-NEXT: chi %r1, 128 ; CHECK-NEXT: ipm %r1 ; CHECK-NEXT: risbg %r1, %r1, 63, 191, 36 -; CHECK-NEXT: vperm %v0, %v3, %v0, %v2 -; CHECK-NEXT: larl %r2, .LCPI0_1 -; CHECK-NEXT: vl %v5, 0(%r2), 3 -; CHECK-NEXT: vgbm %v6, 30583 -; CHECK-NEXT: vn %v0, %v0, %v6 -; CHECK-NEXT: vn %v4, %v4, %v6 -; CHECK-NEXT: vperm %v1, %v1, %v1, %v5 -; CHECK-NEXT: vn %v1, %v1, %v6 -; CHECK-NEXT: vperm %v2, %v0, %v3, %v2 -; CHECK-NEXT: vn %v2, %v2, %v6 +; CHECK-NEXT: vgbm %v3, 30583 +; CHECK-NEXT: vn %v0, %v0, %v3 +; CHECK-NEXT: vlvgf %v1, %r0, 0 +; CHECK-NEXT: vlvgf %v1, %r0, 2 +; CHECK-NEXT: vn %v1, %v1, %v3 +; CHECK-NEXT: vrepf %v2, %v2, 1 +; CHECK-NEXT: vn %v2, %v2, %v3 ; CHECK-NEXT: vrepif %v3, 127 -; CHECK-NEXT: vchlf %v1, %v1, %v3 -; CHECK-NEXT: vlgvf %r3, %v1, 1 -; CHECK-NEXT: vlgvf %r2, %v1, 0 -; CHECK-NEXT: risbg %r2, %r2, 48, 176, 15 -; CHECK-NEXT: rosbg %r2, %r3, 49, 49, 14 -; CHECK-NEXT: vlgvf %r3, %v1, 2 -; CHECK-NEXT: rosbg %r2, %r3, 50, 50, 13 -; CHECK-NEXT: vlgvf %r3, %v1, 3 -; CHECK-NEXT: rosbg %r2, %r3, 51, 51, 12 -; CHECK-NEXT: vchlf %v1, %v4, %v3 -; CHECK-NEXT: vlgvf %r3, %v1, 0 -; CHECK-NEXT: rosbg %r2, %r3, 52, 52, 11 -; CHECK-NEXT: vlgvf %r3, %v1, 1 -; CHECK-NEXT: rosbg %r2, %r3, 53, 53, 10 -; CHECK-NEXT: vlgvf %r3, %v1, 2 -; CHECK-NEXT: rosbg %r2, %r3, 54, 54, 9 -; CHECK-NEXT: vlgvf %r3, %v1, 3 -; CHECK-NEXT: rosbg %r2, %r3, 55, 55, 8 -; CHECK-NEXT: vchlf %v1, %v2, %v3 -; CHECK-NEXT: vlgvf %r3, %v1, 0 -; CHECK-NEXT: rosbg %r2, %r3, 56, 56, 7 -; CHECK-NEXT: vlgvf %r3, %v1, 1 -; CHECK-NEXT: rosbg %r2, %r3, 57, 57, 6 -; CHECK-NEXT: vlgvf %r3, %v1, 2 -; CHECK-NEXT: rosbg %r2, %r3, 58, 58, 5 -; CHECK-NEXT: vlgvf %r3, %v1, 3 -; CHECK-NEXT: rosbg %r2, %r3, 59, 59, 4 ; CHECK-NEXT: vchlf %v0, %v0, %v3 -; CHECK-NEXT: vlgvf %r3, %v0, 0 -; CHECK-NEXT: rosbg %r2, %r3, 60, 60, 3 -; CHECK-NEXT: vlgvf %r3, %v0, 1 -; CHECK-NEXT: rosbg %r2, %r3, 61, 61, 2 -; CHECK-NEXT: vlgvf %r3, %v0, 2 -; CHECK-NEXT: rosbg %r2, %r3, 62, 62, 1 -; CHECK-NEXT: vlgvf %r3, %v0, 3 -; CHECK-NEXT: rosbg %r2, %r3, 63, 63, 0 +; CHECK-NEXT: vlgvf %r13, %v0, 0 +; CHECK-NEXT: vchlf %v2, %v2, %v3 +; CHECK-NEXT: vlgvf %r3, %v2, 1 +; CHECK-NEXT: nilf %r3, 1 +; CHECK-NEXT: vlgvf %r4, %v2, 0 +; CHECK-NEXT: risbg %r2, %r4, 48, 176, 15 +; CHECK-NEXT: rosbg %r2, %r3, 32, 49, 14 +; CHECK-NEXT: vlgvf %r5, %v2, 2 +; CHECK-NEXT: nilf %r5, 1 +; CHECK-NEXT: rosbg %r2, %r5, 32, 50, 13 +; CHECK-NEXT: vlgvf %r14, %v2, 3 +; CHECK-NEXT: nilf %r14, 1 +; CHECK-NEXT: rosbg %r2, %r14, 32, 51, 12 +; CHECK-NEXT: rosbg %r2, %r13, 52, 52, 11 +; CHECK-NEXT: vlgvf %r13, %v0, 1 +; CHECK-NEXT: rosbg %r2, %r13, 
53, 53, 10 +; CHECK-NEXT: vlgvf %r13, %v0, 2 +; CHECK-NEXT: rosbg %r2, %r13, 54, 54, 9 +; CHECK-NEXT: vlgvf %r13, %v0, 3 +; CHECK-NEXT: rosbg %r2, %r13, 55, 55, 8 +; CHECK-NEXT: vchlf %v0, %v1, %v3 +; CHECK-NEXT: vlgvf %r13, %v0, 0 +; CHECK-NEXT: rosbg %r2, %r13, 56, 56, 7 +; CHECK-NEXT: vlgvf %r13, %v0, 1 +; CHECK-NEXT: rosbg %r2, %r13, 57, 57, 6 +; CHECK-NEXT: vlgvf %r13, %v0, 2 +; CHECK-NEXT: rosbg %r2, %r13, 58, 58, 5 +; CHECK-NEXT: vlgvf %r13, %v0, 3 +; CHECK-NEXT: rosbg %r2, %r13, 59, 59, 4 +; CHECK-NEXT: nilf %r4, 1 +; CHECK-NEXT: rosbg %r2, %r4, 32, 60, 3 +; CHECK-NEXT: rosbg %r2, %r3, 32, 61, 2 +; CHECK-NEXT: rosbg %r2, %r5, 32, 62, 1 +; CHECK-NEXT: or %r2, %r14 ; CHECK-NEXT: vlgvb %r4, %v0, 1 ; CHECK-NEXT: vlgvb %r3, %v0, 0 ; CHECK-NEXT: risbg %r3, %r3, 48, 176, 15 @@ -122,7 +116,7 @@ define dso_local void @m() local_unnamed_addr #1 { ; CHECK-NEXT: nr %r2, %r0 ; CHECK-NEXT: larl %r1, g ; CHECK-NEXT: stc %r2, 0(%r1) -; CHECK-NEXT: aghi %r15, 168 +; CHECK-NEXT: lmg %r13, %r15, 272(%r15) ; CHECK-NEXT: br %r14 entry: %n = alloca i32, align 4 diff --git a/llvm/test/CodeGen/X86/abds-neg.ll b/llvm/test/CodeGen/X86/abds-neg.ll index 6e22d855dc831..f6d66ab47ce05 100644 --- a/llvm/test/CodeGen/X86/abds-neg.ll +++ b/llvm/test/CodeGen/X86/abds-neg.ll @@ -1058,15 +1058,15 @@ define i64 @abd_subnsw_i64(i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi ; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %esi ; X86-NEXT: movl %edx, %eax -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1089,15 +1089,15 @@ define i64 @abd_subnsw_i64_undef(i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: subl {{[0-9]+}}(%esp), %esi +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx -; X86-NEXT: xorl %edx, %esi ; X86-NEXT: xorl %edx, %ecx +; X86-NEXT: xorl %edx, %esi ; X86-NEXT: movl %edx, %eax -; X86-NEXT: subl %ecx, %eax -; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: subl %esi, %eax +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: popl %esi ; X86-NEXT: retl ; @@ -1121,11 +1121,11 @@ define i128 @abd_subnsw_i128(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx @@ -1178,11 +1178,11 @@ define i128 @abd_subnsw_i128_undef(i128 %a, i128 %b) nounwind { ; X86-NEXT: pushl %ebx ; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx 
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: subl {{[0-9]+}}(%esp), %edi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %esi ; X86-NEXT: sbbl {{[0-9]+}}(%esp), %edx diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index 0de308a9e0738..4e4891a283ce9 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -1734,20 +1734,20 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-LABEL: not_avg_v16i8_wide_constants: ; SSE2: # %bb.0: ; SSE2-NEXT: movaps (%rdi), %xmm1 -; SSE2-NEXT: movdqa (%rsi), %xmm0 +; SSE2-NEXT: movdqa (%rsi), %xmm2 ; SSE2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm1 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm3 +; SSE2-NEXT: movd %eax, %xmm4 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm4 +; SSE2-NEXT: movd %eax, %xmm3 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm5 @@ -1762,9 +1762,6 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm8 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: movd %eax, %xmm10 -; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax -; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm9 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ -1774,6 +1771,9 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm12 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax +; SSE2-NEXT: movd %eax, %xmm10 +; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax +; SSE2-NEXT: decl %eax ; SSE2-NEXT: movd %eax, %xmm13 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax @@ -1783,45 +1783,43 @@ define void @not_avg_v16i8_wide_constants(ptr %a, ptr %b) nounwind { ; SSE2-NEXT: movd %eax, %xmm15 ; SSE2-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax ; SSE2-NEXT: decl %eax -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,0,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm6[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm6[0,0,0,0] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm1[0],xmm4[1] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm4 = 
xmm4[2],xmm1[2],xmm4[3],xmm1[3] +; SSE2-NEXT: movsd {{.*#+}} xmm4 = xmm3[0],xmm4[1] ; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] ; SSE2-NEXT: movapd %xmm4, %xmm5 ; SSE2-NEXT: andpd %xmm1, %xmm5 ; SSE2-NEXT: xorpd %xmm4, %xmm1 ; SSE2-NEXT: psrlw $1, %xmm1 ; SSE2-NEXT: paddw %xmm5, %xmm1 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm12[0,0,0,0] -; SSE2-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3] +; SSE2-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm14[0,0,0,0] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm9[0],xmm2[1] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] -; SSE2-NEXT: movapd %xmm2, %xmm3 -; SSE2-NEXT: andpd %xmm0, %xmm3 -; SSE2-NEXT: xorpd %xmm2, %xmm0 -; SSE2-NEXT: psrlw $1, %xmm0 -; SSE2-NEXT: paddw %xmm3, %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] -; SSE2-NEXT: pand %xmm2, %xmm0 -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: packuswb %xmm0, %xmm1 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm10[0],xmm0[1] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15] +; SSE2-NEXT: movapd %xmm0, %xmm3 +; SSE2-NEXT: andpd %xmm2, %xmm3 +; SSE2-NEXT: xorpd %xmm0, %xmm2 +; SSE2-NEXT: psrlw $1, %xmm2 +; SSE2-NEXT: paddw %xmm3, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: packuswb %xmm2, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll index 8f82a5bc6554e..d869f8ec01a5a 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll @@ -172,9 +172,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: sbbl %eax, %edi ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NEXT: movl 40(%ebp), %esi -; X86-NEXT: movl %esi, %edx +; X86-NEXT: movl 40(%ebp), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: sarl $31, %edx +; X86-NEXT: movl %ecx, %esi ; X86-NEXT: xorl %edx, %esi ; X86-NEXT: movl 36(%ebp), %ecx ; X86-NEXT: xorl %edx, %ecx @@ -203,45 +204,45 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: sete %al ; X86-NEXT: orb %cl, %al ; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: bsrl %esi, %edx -; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %ecx +; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: addl $32, %edx +; X86-NEXT: bsrl %esi, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %esi, %esi -; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: cmovel %edx, %ecx ; X86-NEXT: bsrl %ebx, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: bsrl %edi, %edi ; X86-NEXT: xorl $31, %edi -; X86-NEXT: orl $32, %edi +; X86-NEXT: addl $32, %edi ; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %edi -; X86-NEXT: orl $64, %edi +; X86-NEXT: addl $64, %edi ; X86-NEXT: movl %eax, %edx ; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: bsrl %eax, %edx +; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx -; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; X86-NEXT: addl $32, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: bsrl %eax, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx ; X86-NEXT: testl %eax, %eax -; X86-NEXT: cmovnel %edx, %ecx +; X86-NEXT: cmovel %edx, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: bsrl %ebx, %esi ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload ; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx +; X86-NEXT: addl $64, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-NEXT: orl %eax, %esi ; X86-NEXT: cmovnel %ecx, %edx @@ -379,9 +380,9 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: adcl $-1, %eax ; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: adcl $-1, %ecx -; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: adcl $-1, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: adcl $-1, %ecx ; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll index 0bef9ee50bd54..7bbddefd82721 100644 --- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll +++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll @@ -173,17 +173,17 @@ define i128 @scalar_i128(i128 %x, i128 
%y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl 36(%ebp), %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %esi, %esi ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: bsrl %edi, %edx ; X86-NEXT: xorl $31, %edx ; X86-NEXT: bsrl %ebx, %eax ; X86-NEXT: xorl $31, %eax -; X86-NEXT: orl $32, %eax +; X86-NEXT: addl $32, %eax ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %edx, %eax -; X86-NEXT: orl $64, %eax +; X86-NEXT: addl $64, %eax ; X86-NEXT: movl 36(%ebp), %edx ; X86-NEXT: orl %esi, %edx ; X86-NEXT: cmovnel %ecx, %eax @@ -193,7 +193,7 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: movl 20(%ebp), %ecx ; X86-NEXT: bsrl %ecx, %ecx ; X86-NEXT: xorl $31, %ecx -; X86-NEXT: orl $32, %ecx +; X86-NEXT: addl $32, %ecx ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: cmovnel %edx, %ecx ; X86-NEXT: movl 16(%ebp), %edi @@ -201,10 +201,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind { ; X86-NEXT: xorl $31, %esi ; X86-NEXT: bsrl 12(%ebp), %edx ; X86-NEXT: xorl $31, %edx -; X86-NEXT: orl $32, %edx +; X86-NEXT: addl $32, %edx ; X86-NEXT: testl %edi, %edi ; X86-NEXT: cmovnel %esi, %edx -; X86-NEXT: orl $64, %edx +; X86-NEXT: addl $64, %edx ; X86-NEXT: movl 20(%ebp), %edi ; X86-NEXT: movl %edi, %esi ; X86-NEXT: orl %ebx, %esi diff --git a/llvm/test/CodeGen/X86/freeze-vector.ll b/llvm/test/CodeGen/X86/freeze-vector.ll index 953a5e7285fe4..0f66d42697d97 100644 --- a/llvm/test/CodeGen/X86/freeze-vector.ll +++ b/llvm/test/CodeGen/X86/freeze-vector.ll @@ -171,15 +171,15 @@ define void @freeze_extractelement(ptr %origin0, ptr %origin1, ptr %dst) nounwin ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: vmovdqa (%edx), %xmm0 -; X86-NEXT: vpand (%ecx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%ecx), %xmm0 +; X86-NEXT: vpand (%edx), %xmm0, %xmm0 ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: freeze_extractelement: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm0 -; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rsi), %xmm0 +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq %i0 = load <16 x i8>, ptr %origin0 @@ -198,8 +198,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: vmovdqa (%esi), %xmm0 -; X86-NEXT: vpand (%edx), %xmm0, %xmm0 +; X86-NEXT: vmovdqa (%edx), %xmm0 +; X86-NEXT: vpand (%esi), %xmm0, %xmm0 ; X86-NEXT: vmovdqa %xmm0, (%ecx) ; X86-NEXT: vpextrb $6, %xmm0, (%eax) ; X86-NEXT: popl %esi @@ -207,8 +207,8 @@ define void @freeze_extractelement_escape(ptr %origin0, ptr %origin1, ptr %dst, ; ; X64-LABEL: freeze_extractelement_escape: ; X64: # %bb.0: -; X64-NEXT: vmovdqa (%rdi), %xmm0 -; X64-NEXT: vpand (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovdqa (%rsi), %xmm0 +; X64-NEXT: vpand (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovdqa %xmm0, (%rcx) ; X64-NEXT: vpextrb $6, %xmm0, (%rdx) ; X64-NEXT: retq @@ -239,8 +239,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X86-NEXT: movl 32(%ebp), %edx ; X86-NEXT: movl 12(%ebp), %esi ; X86-NEXT: movl 8(%ebp), %edi -; X86-NEXT: vmovaps (%edi), %xmm0 -; X86-NEXT: vandps (%esi), %xmm0, %xmm0 +; X86-NEXT: vmovaps (%esi), %xmm0 +; X86-NEXT: vandps (%edi), %xmm0, %xmm0 ; X86-NEXT: vmovaps %xmm0, (%esp) ; 
X86-NEXT: movzbl (%esp,%ecx), %ecx ; X86-NEXT: cmpb (%esp,%eax), %cl @@ -255,8 +255,8 @@ define void @freeze_extractelement_extra_use(ptr %origin0, ptr %origin1, i64 %id ; X64: # %bb.0: ; X64-NEXT: andl $15, %ecx ; X64-NEXT: andl $15, %edx -; X64-NEXT: vmovaps (%rdi), %xmm0 -; X64-NEXT: vandps (%rsi), %xmm0, %xmm0 +; X64-NEXT: vmovaps (%rsi), %xmm0 +; X64-NEXT: vandps (%rdi), %xmm0, %xmm0 ; X64-NEXT: vmovaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NEXT: movzbl -24(%rsp,%rdx), %eax ; X64-NEXT: cmpb -24(%rsp,%rcx), %al diff --git a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll index 2ac2be5545dfd..d2b292f1a7996 100644 --- a/llvm/test/CodeGen/X86/setcc-non-simple-type.ll +++ b/llvm/test/CodeGen/X86/setcc-non-simple-type.ll @@ -119,8 +119,8 @@ define void @failing(ptr %0, ptr %1) nounwind { ; CHECK-AVX2-NEXT: .LBB0_2: # %vector.body ; CHECK-AVX2-NEXT: # Parent Loop BB0_1 Depth=1 ; CHECK-AVX2-NEXT: # => This Inner Loop Header: Depth=2 -; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %xmm5 -; CHECK-AVX2-NEXT: vmovdqu 1040(%rdx,%rsi), %xmm6 +; CHECK-AVX2-NEXT: vmovdqu 1024(%rdx,%rsi), %ymm5 +; CHECK-AVX2-NEXT: vextracti128 $1, %ymm5, %xmm6 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm5, %rdi ; CHECK-AVX2-NEXT: vpextrq $1, %xmm6, %r8 ; CHECK-AVX2-NEXT: vmovq %xmm5, %r9