From 44a3268b9cd043ac96dc50f1f3b339c2307f20d1 Mon Sep 17 00:00:00 2001
From: Rajveer
Date: Sun, 22 Jun 2025 17:39:34 +0530
Subject: [PATCH 1/2] [VectorCombine] New folding pattern for
 extract/binop/shuffle chains

Resolves #144654
Part of #143088

This adds a new `foldShuffleChainsToReduce` fold for horizontal reduction of
patterns like:

```llvm
define i16 @test_reduce_v8i16(<8 x i16> %a0) local_unnamed_addr #0 {
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
  %7 = extractelement <8 x i16> %6, i64 0
  ret i16 %7
}
```

...which can be reduced to a single llvm.vector.reduce.umin.v8i16(%a0)
intrinsic call. The same transformation is applied to the other supported
ops when the cost model permits it.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 177 ++++++++++++++++
 .../X86/shuffle-chain-reduction-umin.ll       | 200 ++++++++++++++++++
 .../fold-shuffle-chains-to-reduce.ll          | 127 +++++++++++
 3 files changed, 504 insertions(+)
 create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..c7cc8290e88e5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -130,6 +130,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -2988,6 +2989,179 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  auto *EEI = dyn_cast<ExtractElementInst>(&I);
+  if (!EEI)
+    return false;
+
+  std::queue<Value *> InstWorklist;
+  Value *InitEEV = nullptr;
+  Intrinsic::ID CommonOp = 0;
+
+  bool IsFirstCallInst = true;
+  bool ShouldBeCallInst = true;
+
+  SmallVector<Value *, 3> PrevVecV(3, nullptr);
+  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
+  int64_t VecSize = -1;
+
+  Value *VecOp;
+  if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOp->getType());
+  if (!FVT)
+    return false;
+
+  VecSize = FVT->getNumElements();
+  if (VecSize < 2 || (VecSize % 2) != 0)
+    return false;
+
+  ShuffleMaskHalf = 1;
+  PrevVecV[2] = VecOp;
+  InitEEV = EEI;
+
+  InstWorklist.push(PrevVecV[2]);
+
+  while (!InstWorklist.empty()) {
+    Value *V = InstWorklist.front();
+    InstWorklist.pop();
+
+    auto *CI = dyn_cast<Instruction>(V);
+    if (!CI)
+      return false;
+
+    if (auto *CallI = dyn_cast<CallInst>(CI)) {
+      if (!ShouldBeCallInst || !PrevVecV[2])
+        return false;
+
+      if (!IsFirstCallInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
+        return false;
+      IsFirstCallInst = false;
+
+      auto *II = dyn_cast<IntrinsicInst>(CallI);
+      if (!II)
+        return false;
+
+      if (!CommonOp)
+        CommonOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != CommonOp)
+        return false;
+
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        auto *Op0 = CallI->getOperand(0);
+        auto *Op1 = CallI->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallInst ^= 1;
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+      if (ShouldBeCallInst ||
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (SVInst != PrevVecV[1])
+        return false;
+
+      auto *ShuffleVec = SVInst->getOperand(0);
+      if (!ShuffleVec || ShuffleVec != PrevVecV[0])
+        return false;
+
+      SmallVector<int> CurMask;
+      SVInst->getShuffleMask(CurMask);
+
+      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+        return false;
+      ExpectedShuffleMaskHalf *= 2;
+
+      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+          return false;
+        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+          return false;
+      }
+      ShuffleMaskHalf *= 2;
+      if (ExpectedShuffleMaskHalf == VecSize)
+        break;
+      ShouldBeCallInst ^= 1;
+    } else {
+      return false;
+    }
+  }
+
+  if (ShouldBeCallInst)
+    return false;
+
+  assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize &&
+         "Expected Match for Vector Size and Mask Half");
+
+  Value *FinalVecV = PrevVecV[0];
+  auto *FinalVecVTy = dyn_cast<FixedVectorType>(FinalVecV->getType());
+
+  if (!InitEEV || !FinalVecV)
+    return false;
+
+  assert(FinalVecVTy && "Expected non-null value for Vector Type");
+
+  Intrinsic::ID ReducedOp = 0;
+  switch (CommonOp) {
+  case Intrinsic::umin:
+    ReducedOp = Intrinsic::vector_reduce_umin;
+    break;
+  case Intrinsic::umax:
+    ReducedOp = Intrinsic::vector_reduce_umax;
+    break;
+  case Intrinsic::smin:
+    ReducedOp = Intrinsic::vector_reduce_smin;
+    break;
+  case Intrinsic::smax:
+    ReducedOp = Intrinsic::vector_reduce_smax;
+    break;
+  default:
+    return false;
+  }
+
+  InstructionCost OrigCost = 0;
+  unsigned int NumLevels = Log2_64(VecSize);
+
+  for (unsigned int Level = 0; Level < NumLevels; ++Level) {
+    OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                   FinalVecVTy, FinalVecVTy);
+    OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy);
+  }
+  OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy,
+                                     CostKind, 0);
+
+  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+  if (NewCost >= OrigCost)
+    return false;
+
+  auto *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(*InitEEV, *ReducedResult);
+
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 /// reduce(trunc(x)) -> trunc(reduce(x)).
 /// reduce(sext(x)) -> sext(reduce(x)).
@@ -3705,6 +3879,9 @@ bool VectorCombine::run() { MadeChange |= foldShuffleFromReductions(I); MadeChange |= foldCastFromReductions(I); break; + case Instruction::ExtractElement: + MadeChange |= foldShuffleChainsToReduce(I); + break; case Instruction::ICmp: case Instruction::FCmp: MadeChange |= foldExtractExtract(I); diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll new file mode 100644 index 0000000000000..82b20ccc5b8f5 --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i8 @test_reduce_v16i8(<16 x i8> %a0) { +; +; CHECK-LABEL: define i8 @test_reduce_v16i8( +; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP8]] +; + %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32> + %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1) + %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32> + %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3) + %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32> + %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5) + %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32> + %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7) + %9 = extractelement <16 x i8> %8, i64 0 + ret i8 %9 +} + +define i8 @test_reduce_v32i8(<32 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v32i8( +; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32> + %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1) + %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32> + %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3) + %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32> + %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5) + %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32> + %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7) + %9 = shufflevector <32 x i8> %8, 
<32 x i8> poison, <32 x i32> + %10 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9) + %11 = extractelement <32 x i8> %10, i64 0 + ret i8 %11 +} + +define i16 @test_reduce_v16i16(<16 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v16i16( +; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32> + %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1) + %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32> + %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3) + %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32> + %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5) + %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32> + %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7) + %9 = extractelement <16 x i16> %8, i64 0 + ret i16 %9 +} + +define i8 @test_reduce_v64i8(<64 x i8> %a0) { +; CHECK-LABEL: define i8 @test_reduce_v64i8( +; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]]) +; CHECK-NEXT: ret i8 [[TMP1]] +; + %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32> + %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1) + %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32> + %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3) + %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32> + %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5) + %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32> + %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7) + %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32> + %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9) + %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32> + %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11) + %13 = extractelement <64 x i8> %12, i64 0 + ret i8 %13 +} + +define i16 @test_reduce_v32i16(<32 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v32i16( +; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32> + %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1) + %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32> + %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3) + %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32> + %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5) + %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32> + %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7) + %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32> + %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9) + %11 = extractelement <32 x i16> %10, i64 0 + ret i16 %11 +} diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll new file mode 100644 index 0000000000000..3cb25ba4ecce6 --- /dev/null +++ 
b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -0,0 +1,127 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=vector-combine -S | FileCheck %s + +define i16 @test_reduce_v8i16(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]]) +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]]) +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]]) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]]) +; CHECK-NEXT: ret i16 [[TMP15]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + + %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32> + %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8) + %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32> + %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10) + %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32> + %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12) + %14 = extractelement <8 x i16> %13, i64 0 + + %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 
%14) + + ret i16 %15 +} + +define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg1( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> + %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1) + %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> + %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3) + %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> + %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5) + %7 = extractelement <8 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3( +; CHECK-SAME: <8 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]]) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]]) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]]) +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], 
i64 0
+; CHECK-NEXT:    ret i16 [[TMP8]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6)
+  %8 = extractelement <8 x i16> %7, i64 0
+  ret i16 %8
+}

From eb9570df3557679ee41e3e098c0d202a2ff95408 Mon Sep 17 00:00:00 2001
From: Rajveer
Date: Sat, 28 Jun 2025 16:31:51 +0530
Subject: [PATCH 2/2] Include support for Add/Mul/Or/And/Xor Binary Operations

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 252 +++++++++++++-----
 .../fold-shuffle-chains-to-reduce.ll          |  68 +++++
 2 files changed, 257 insertions(+), 63 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index c7cc8290e88e5..f8fb74de49bd2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2989,21 +2989,72 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return foldSelectShuffle(*Shuffle, true);
 }
 
+/// For a given chain of patterns of the following form:
+///
+/// ```
+/// %1 = shufflevector <n x ty> %0, <n x ty> poison, mask
+///
+/// %2 = tail call <n x ty> llvm.<umin/umax/smin/smax>(<n x ty> %0, <n x ty> %1)
+/// OR
+/// %2 = add/mul/or/and/xor <n x ty> %0, %1
+///
+/// %3 = shufflevector <n x ty> %2, <n x ty> poison, mask
+/// ...
+/// ...
+/// %(i - 1) = tail call <n x ty> llvm.<umin/umax/smin/smax>(<n x ty> %(i - 3),
+///            <n x ty> %(i - 2))
+/// OR
+/// %(i - 1) = add/mul/or/and/xor <n x ty> %(i - 3), %(i - 2)
+///
+/// %(i) = extractelement <n x ty> %(i - 1), 0
+/// ```
+///
+/// Where:
+/// `mask` follows a partition pattern:
+///
+/// Ex:
+/// [n = 8, p = poison]
+///
+/// 4 5 6 7 | p p p p
+/// 2 3 | p p p p p p
+/// 1 | p p p p p p p
+///
+/// For powers of 2, there's a consistent pattern, but for other cases
+/// the parity of the current half value at each step decides the
+/// next partition half (see `ExpectedParityMask` for more logical details
+/// in generalising this).
+///
+/// Ex:
+/// [n = 6]
+///
+/// 3 4 5 | p p p
+/// 1 2 | p p p p
+/// 1 | p p p p p
 bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  // Going bottom-up for the pattern.
   auto *EEI = dyn_cast<ExtractElementInst>(&I);
   if (!EEI)
     return false;
 
   std::queue<Value *> InstWorklist;
+  InstructionCost OrigCost = 0;
+
   Value *InitEEV = nullptr;
-  Intrinsic::ID CommonOp = 0;
 
-  bool IsFirstCallInst = true;
-  bool ShouldBeCallInst = true;
+  // Common instruction operation after each shuffle op.
+  unsigned int CommonCallOp = 0;
+  Instruction::BinaryOps CommonBinOp = Instruction::BinaryOpsEnd;
+
+  bool IsFirstCallOrBinInst = true;
+  bool ShouldBeCallOrBinInst = true;
+
+  // This stores the last used instructions for shuffle/common op.
+  //
+  // PrevVecV[2] stores the first vector from extract element instruction,
+  // while PrevVecV[0] / PrevVecV[1] store the last two simultaneous
+  // instructions from either shuffle/common op.
   SmallVector<Value *, 3> PrevVecV(3, nullptr);
-  int64_t ShuffleMaskHalf = -1, ExpectedShuffleMaskHalf = 1;
-  int64_t VecSize = -1;
 
   Value *VecOp;
   if (!match(&I, m_ExtractElt(m_Value(VecOp), m_Zero())))
@@ -3013,11 +3064,29 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
   if (!FVT)
     return false;
 
-  VecSize = FVT->getNumElements();
-  if (VecSize < 2 || (VecSize % 2) != 0)
+  int64_t VecSize = FVT->getNumElements();
+  if (VecSize < 2)
     return false;
 
-  ShuffleMaskHalf = 1;
+  // Number of levels would be ~log2(n), considering we always partition
+  // by half for this fold pattern.
+  unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
+  int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
+
+  // This is how we generalise for all element sizes.
+  // At each step, if the vector size is odd, we need non-poison
+  // values to cover the dominant half so we don't miss out on any element.
+  //
+  // This mask will help us retrieve this as we go from bottom to top:
+  //
+  // Mask Set   -> N = N * 2 - 1
+  // Mask Unset -> N = N * 2
+  for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
+       Cur = (Cur + 1) / 2, --Mask) {
+    if (Cur & 1)
+      ExpectedParityMask |= (1ll << Mask);
+  }
+
   PrevVecV[2] = VecOp;
   InitEEV = EEI;
 
@@ -3031,25 +3100,23 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
     if (!CI)
       return false;
 
-    if (auto *CallI = dyn_cast<CallInst>(CI)) {
-      if (!ShouldBeCallInst || !PrevVecV[2])
+    if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
+      if (!ShouldBeCallOrBinInst || !PrevVecV[2])
         return false;
 
-      if (!IsFirstCallInst &&
+      if (!IsFirstCallOrBinInst &&
           any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
         return false;
 
-      if (CallI != (IsFirstCallInst ? PrevVecV[2] : PrevVecV[0]))
-        return false;
-      IsFirstCallInst = false;
-
-      auto *II = dyn_cast<IntrinsicInst>(CallI);
-      if (!II)
+      // For the first found call/bin op, the vector has to come from the
+      // extract element op.
+      if (II != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
         return false;
+      IsFirstCallOrBinInst = false;
 
-      if (!CommonOp)
-        CommonOp = II->getIntrinsicID();
-      if (II->getIntrinsicID() != CommonOp)
+      if (!CommonCallOp)
+        CommonCallOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != CommonCallOp)
         return false;
 
       switch (II->getIntrinsicID()) {
@@ -3057,8 +3124,56 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
       case Intrinsic::umax:
       case Intrinsic::smin:
      case Intrinsic::smax: {
-        auto *Op0 = CallI->getOperand(0);
-        auto *Op1 = CallI->getOperand(1);
+        auto *Op0 = II->getOperand(0);
+        auto *Op1 = II->getOperand(1);
         PrevVecV[0] = Op0;
         PrevVecV[1] = Op1;
         break;
       }
       default:
         return false;
       }
+      ShouldBeCallOrBinInst ^= 1;
+
+      IntrinsicCostAttributes ICA(
+          CommonCallOp, II->getType(),
+          {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
+      OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
 
+      // We may need a swap here since it can be (a, b) or (b, a)
+      // and accordingly change as we go up.
       if (!isa<ShuffleVectorInst>(PrevVecV[1]))
         std::swap(PrevVecV[0], PrevVecV[1]);
       InstWorklist.push(PrevVecV[1]);
       InstWorklist.push(PrevVecV[0]);
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
+      // Similar logic for bin ops.
+
+      if (!ShouldBeCallOrBinInst || !PrevVecV[2])
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (BinOp != (IsFirstCallOrBinInst ? PrevVecV[2] : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (CommonBinOp == Instruction::BinaryOpsEnd)
+        CommonBinOp = BinOp->getOpcode();
+
+      if (BinOp->getOpcode() != CommonBinOp)
+        return false;
+
+      switch (CommonBinOp) {
+      case BinaryOperator::Add:
+      case BinaryOperator::Mul:
+      case BinaryOperator::Or:
+      case BinaryOperator::And:
+      case BinaryOperator::Xor: {
+        auto *Op0 = BinOp->getOperand(0);
+        auto *Op1 = BinOp->getOperand(1);
         PrevVecV[0] = Op0;
         PrevVecV[1] = Op1;
         break;
@@ -3066,14 +3181,19 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
       default:
         return false;
       }
-      ShouldBeCallInst ^= 1;
+      ShouldBeCallOrBinInst ^= 1;
+
+      OrigCost +=
+          TTI.getArithmeticInstrCost(CommonBinOp, BinOp->getType(), CostKind);
 
       if (!isa<ShuffleVectorInst>(PrevVecV[1]))
         std::swap(PrevVecV[0], PrevVecV[1]);
       InstWorklist.push(PrevVecV[1]);
       InstWorklist.push(PrevVecV[0]);
     } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
-      if (ShouldBeCallInst ||
+      // We shouldn't have any null values in the previous vectors;
+      // if so, there was a mismatch in the pattern.
+      if (ShouldBeCallOrBinInst ||
           any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
         return false;
 
@@ -3084,70 +3204,76 @@ bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
       if (!ShuffleVec || ShuffleVec != PrevVecV[0])
         return false;
 
-      SmallVector<int> CurMask;
-      SVInst->getShuffleMask(CurMask);
-
-      if (ShuffleMaskHalf != ExpectedShuffleMaskHalf)
+      if (!isa<PoisonValue>(SVInst->getOperand(1)))
         return false;
-      ExpectedShuffleMaskHalf *= 2;
 
+      ArrayRef<int> CurMask = SVInst->getShuffleMask();
+
+      // Subtract the parity mask when checking the condition.
       for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
-        if (Mask < ShuffleMaskHalf && CurMask[Mask] != ShuffleMaskHalf + Mask)
+        if (Mask < ShuffleMaskHalf &&
+            CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
           return false;
         if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
           return false;
       }
+
+      // Update mask values.
       ShuffleMaskHalf *= 2;
-      if (ExpectedShuffleMaskHalf == VecSize)
+      ShuffleMaskHalf -= (ExpectedParityMask & 1);
+      ExpectedParityMask >>= 1;
+
+      OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                     SVInst->getType(), SVInst->getType(),
+                                     CurMask, CostKind);
+
+      VisitedCnt += 1;
+      if (!ExpectedParityMask && VisitedCnt == NumLevels)
         break;
-      ShouldBeCallInst ^= 1;
+
+      ShouldBeCallOrBinInst ^= 1;
     } else {
       return false;
     }
   }
 
-  if (ShouldBeCallInst)
+  // Pattern should end with a shuffle op.
+ if (ShouldBeCallOrBinInst) return false; - assert(VecSize != -1 && ExpectedShuffleMaskHalf == VecSize && - "Expected Match for Vector Size and Mask Half"); + assert(VecSize != -1 && "Expected Match for Vector Size"); Value *FinalVecV = PrevVecV[0]; - auto *FinalVecVTy = dyn_cast(FinalVecV->getType()); - if (!InitEEV || !FinalVecV) return false; + auto *FinalVecVTy = dyn_cast(FinalVecV->getType()); + assert(FinalVecVTy && "Expected non-null value for Vector Type"); Intrinsic::ID ReducedOp = 0; - switch (CommonOp) { - case Intrinsic::umin: - ReducedOp = Intrinsic::vector_reduce_umin; - break; - case Intrinsic::umax: - ReducedOp = Intrinsic::vector_reduce_umax; - break; - case Intrinsic::smin: - ReducedOp = Intrinsic::vector_reduce_smin; - break; - case Intrinsic::smax: - ReducedOp = Intrinsic::vector_reduce_smax; - break; - default: - return false; - } - - InstructionCost OrigCost = 0; - unsigned int NumLevels = Log2_64(VecSize); - - for (unsigned int Level = 0; Level < NumLevels; ++Level) { - OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, - FinalVecVTy, FinalVecVTy); - OrigCost += TTI.getArithmeticInstrCost(Instruction::ICmp, FinalVecVTy); + if (CommonCallOp) { + switch (CommonCallOp) { + case Intrinsic::umin: + ReducedOp = Intrinsic::vector_reduce_umin; + break; + case Intrinsic::umax: + ReducedOp = Intrinsic::vector_reduce_umax; + break; + case Intrinsic::smin: + ReducedOp = Intrinsic::vector_reduce_smin; + break; + case Intrinsic::smax: + ReducedOp = Intrinsic::vector_reduce_smax; + break; + default: + return false; + } + } else if (CommonBinOp != Instruction::BinaryOpsEnd) { + ReducedOp = getReductionForBinop(CommonBinOp); + if (!ReducedOp) + return false; } - OrigCost += TTI.getVectorInstrCost(Instruction::ExtractElement, FinalVecVTy, - CostKind, 0); IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV}); InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind); diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll index 3cb25ba4ecce6..403ce33b5344e 100644 --- a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll +++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll @@ -17,6 +17,52 @@ define i16 @test_reduce_v8i16(<8 x i16> %a0) { ret i16 %7 } +define i16 @test_reduce_v7i16_or(<7 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v7i16_or( +; CHECK-SAME: <7 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32> + %2 = or <7 x i16> %a0, %1 + %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32> + %4 = or <7 x i16> %2, %3 + %5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32> + %6 = or <7 x i16> %4, %5 + %7 = extractelement <7 x i16> %6, i64 0 + ret i16 %7 +} + +define i16 @test_reduce_v3i16_and(<3 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v3i16_and( +; CHECK-SAME: <3 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32> + %2 = and <3 x i16> %a0, %1 + %3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32> + %4 = and <3 x i16> %2, %3 + %5 = extractelement <3 x i16> %4, i64 0 + ret i16 %5 +} + +define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) { +; CHECK-LABEL: define i16 
@test_reduce_v6i16_xor( +; CHECK-SAME: <6 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]]) +; CHECK-NEXT: ret i16 [[TMP1]] +; + %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> + %2 = xor <6 x i16> %a0, %1 + %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> + %4 = xor <6 x i16> %2, %3 + %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> + %6 = xor <6 x i16> %4, %5 + %7 = extractelement <6 x i16> %6, i64 0 + ret i16 %7 +} + define i16 @test_reduce_v8i16_2(<8 x i16> %a0) { ; CHECK-LABEL: define i16 @test_reduce_v8i16_2( ; CHECK-SAME: <8 x i16> [[A0:%.*]]) { @@ -125,3 +171,25 @@ define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) { %8 = extractelement <8 x i16> %7, i64 0 ret i16 %8 } + +define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) { +; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg( +; CHECK-SAME: <6 x i16> [[A0:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0 +; CHECK-NEXT: ret i16 [[TMP7]] +; + %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32> + %2 = xor <6 x i16> %a0, %1 + %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32> + %4 = xor <6 x i16> %2, %3 + %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32> + %6 = xor <6 x i16> %4, %5 + %7 = extractelement <6 x i16> %6, i64 0 + ret i16 %7 +}