From 27569450c58c82fd717007da9510944993660a04 Mon Sep 17 00:00:00 2001 From: Lauren Date: Tue, 1 Jul 2025 02:55:20 -0400 Subject: [PATCH 1/7] [VectorCombine] Expand `vector_insert` into shufflevector for earlier cost optimizations (#145512) Move folding logic from `InstCombineCalls` to `VectorCombine` to ensure `vector_insert` intrinsics are expanded into shufflevector instructions before cost-based shuffle optimizations run. Canonicalizes fixed-width vectors only. --- .../InstCombine/InstCombineCalls.cpp | 46 ------------ .../Transforms/Vectorize/VectorCombine.cpp | 71 +++++++++++++++++++ .../VectorCombine/fold-vector-insert.ll | 71 +++++++++++++++++++ 3 files changed, 142 insertions(+), 46 deletions(-) create mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5b398d3b75f59..df29024a86f67 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3462,52 +3462,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } - case Intrinsic::vector_insert: { - Value *Vec = II->getArgOperand(0); - Value *SubVec = II->getArgOperand(1); - Value *Idx = II->getArgOperand(2); - auto *DstTy = dyn_cast(II->getType()); - auto *VecTy = dyn_cast(Vec->getType()); - auto *SubVecTy = dyn_cast(SubVec->getType()); - - // Only canonicalize if the destination vector, Vec, and SubVec are all - // fixed vectors. - if (DstTy && VecTy && SubVecTy) { - unsigned DstNumElts = DstTy->getNumElements(); - unsigned VecNumElts = VecTy->getNumElements(); - unsigned SubVecNumElts = SubVecTy->getNumElements(); - unsigned IdxN = cast(Idx)->getZExtValue(); - - // An insert that entirely overwrites Vec with SubVec is a nop. 
- if (VecNumElts == SubVecNumElts) - return replaceInstUsesWith(CI, SubVec); - - // Widen SubVec into a vector of the same width as Vec, since - // shufflevector requires the two input vectors to be the same width. - // Elements beyond the bounds of SubVec within the widened vector are - // undefined. - SmallVector WidenMask; - unsigned i; - for (i = 0; i != SubVecNumElts; ++i) - WidenMask.push_back(i); - for (; i != VecNumElts; ++i) - WidenMask.push_back(PoisonMaskElem); - - Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); - - SmallVector Mask; - for (unsigned i = 0; i != IdxN; ++i) - Mask.push_back(i); - for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i) - Mask.push_back(i); - for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i) - Mask.push_back(i); - - Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); - return replaceInstUsesWith(CI, Shuffle); - } - break; - } case Intrinsic::vector_extract: { Value *Vec = II->getArgOperand(0); Value *Idx = II->getArgOperand(1); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 19e82099e87f0..dbbc6c5a07ec8 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -112,6 +112,7 @@ class VectorCombine { bool foldExtractExtract(Instruction &I); bool foldInsExtFNeg(Instruction &I); bool foldInsExtBinop(Instruction &I); + bool foldVectorInsertToShuffle(Instruction &I); bool foldInsExtVectorToShuffle(Instruction &I); bool foldBitOpOfBitcasts(Instruction &I); bool foldBitcastShuffle(Instruction &I); @@ -804,6 +805,73 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) { return true; } +/// Try to fold vector_insert intrinsics into shufflevector instructions. +bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { + auto *II = dyn_cast(&I); + // This optimization only applies to vector_insert intrinsics. 
+ if (!II || II->getIntrinsicID() != Intrinsic::vector_insert) + return false; + + Value *Vec = II->getArgOperand(0); + Value *SubVec = II->getArgOperand(1); + Value *Idx = II->getArgOperand(2); + + // Caller guarantees DstTy is a fixed vector. + auto *DstTy = cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + auto *SubVecTy = dyn_cast(SubVec->getType()); + + // Only canonicalize if Vec and SubVec are both fixed vectors. + if (!VecTy || !SubVecTy) + return false; + + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned SubVecNumElts = SubVecTy->getNumElements(); + auto *SubVecPtr = dyn_cast(Idx); + if (!SubVecPtr) + return false; + + unsigned SubVecIdx = SubVecPtr->getZExtValue(); + + // Ensure insertion of SubVec doesn't exceed Dst bounds. + if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts) + return false; + + // An insert that entirely overwrites Vec with SubVec is a nop. + if (VecNumElts == SubVecNumElts) { + replaceValue(I, *SubVec); + return true; + } + + // Widen SubVec into a vector of the same width as Vec, since + // shufflevector requires the two input vectors to be the same width. + // Elements beyond the bounds of SubVec within the widened vector are + // undefined. 
+ SmallVector WidenMask; + unsigned int i = 0; + for (i = 0; i != SubVecNumElts; ++i) + WidenMask.push_back(i); + for (; i != VecNumElts; ++i) + WidenMask.push_back(PoisonMaskElem); + + auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); + Worklist.pushValue(WidenShuffle); + + SmallVector Mask; + unsigned int j; + for (i = 0; i != SubVecIdx; ++i) + Mask.push_back(i); + for (j = 0; j != SubVecNumElts; ++j) + Mask.push_back(DstNumElts + j); + for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i) + Mask.push_back(i); + + auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); + replaceValue(I, *Shuffle); + return true; +} + bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) { // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y)) Value *LHSSrc, *RHSSrc; @@ -3639,6 +3707,9 @@ bool VectorCombine::run() { // dispatching to folding functions if there's no chance of matching. if (IsFixedVectorType) { switch (Opcode) { + case Instruction::Call: + MadeChange |= foldVectorInsertToShuffle(I); + break; case Instruction::InsertElement: MadeChange |= vectorizeLoadInsert(I); break; diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll new file mode 100644 index 0000000000000..976fdb322005b --- /dev/null +++ b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll @@ -0,0 +1,71 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=vector-combine -S | FileCheck %s + +declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64) +declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64) +declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64) +declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64) +declare @llvm.vector.insert.nxv4i32.v2i32(, <2 x i32>, i64) + +define <8 x i32> @vector_insert_begin(<8 x i32> %vec, 
<4 x i32> %subvec) { +; CHECK-LABEL: define <8 x i32> @vector_insert_begin( +; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0) + ret <8 x i32> %result +} + +define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: define <8 x i32> @vector_insert_middle( +; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) + ret <8 x i32> %result +} + +define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: define <8 x i32> @vector_insert_end( +; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4) + ret <8 x i32> %result +} + +define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) { +; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite( +; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: ret <8 x i32> [[SUBVEC]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0) + ret <8 x i32> %result +} + 
+define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) { +; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end( +; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7) + ret <8 x i32> %result +} + +define @vector_insert_no_fold_scalable( %vec, <2 x i32> %subvec) { +; CHECK-LABEL: define @vector_insert_no_fold_scalable( +; CHECK-SAME: [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) { +; CHECK-NEXT: [[RESULT:%.*]] = call @llvm.vector.insert.nxv4i32.v2i32( [[VEC]], <2 x i32> [[SUBVEC]], i64 0) +; CHECK-NEXT: ret [[RESULT]] +; + %result = call @llvm.vector.insert.nxv4i32.v2i32( %vec, <2 x i32> %subvec, i64 0) + ret %result +} From fc894c8d658554dbb04e5a99c3c8f1aeef986220 Mon Sep 17 00:00:00 2001 From: Lauren Date: Tue, 1 Jul 2025 04:52:10 -0400 Subject: [PATCH 2/7] [VectorCombine] Move canonicalize-vector-insert tests from InstCombine to VectorCombine --- .../canonicalize-vector-insert.ll | 19 +++-- .../VectorCombine/fold-vector-insert.ll | 71 ------------------- 2 files changed, 15 insertions(+), 75 deletions(-) rename llvm/test/Transforms/{InstCombine => VectorCombine}/canonicalize-vector-insert.ll (84%) delete mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll similarity index 84% rename from llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll rename to llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll index ab7a50e55db0f..af6fe52c07920 100644 --- 
a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll +++ b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -passes=vector-combine -S | FileCheck %s ; llvm.vector.insert canonicalizes to shufflevector in the fixed case. In the ; scalable case, we lower to the INSERT_SUBVECTOR ISD node. @@ -31,7 +31,7 @@ define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) { define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) { ; CHECK-LABEL: @valid_insertion_a( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP2]] ; %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0) @@ -71,7 +71,7 @@ define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) { define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) { ; CHECK-LABEL: @valid_insertion_e( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP2]] ; %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0) @@ -91,7 +91,7 @@ define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) { define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) { ; CHECK-LABEL: @valid_insertion_g( ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> 
poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> ; CHECK-NEXT: ret <8 x i32> [[TMP2]] ; %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0) @@ -108,6 +108,17 @@ define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) { ret <8 x i32> %1 } +; Tests insertion at middle index +define <8 x i32> @valid_insertion_i(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_i( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) + ret <8 x i32> %result +} + ; ============================================================================ ; ; Scalable cases ; ============================================================================ ; diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll deleted file mode 100644 index 976fdb322005b..0000000000000 --- a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll +++ /dev/null @@ -1,71 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -passes=vector-combine -S | FileCheck %s - -declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64) -declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64) -declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64) -declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64) -declare @llvm.vector.insert.nxv4i32.v2i32(, <2 x i32>, i64) - -define <8 x i32> 
@vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) { -; CHECK-LABEL: define <8 x i32> @vector_insert_begin( -; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[RESULT]] -; - %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0) - ret <8 x i32> %result -} - -define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) { -; CHECK-LABEL: define <8 x i32> @vector_insert_middle( -; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[RESULT]] -; - %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) - ret <8 x i32> %result -} - -define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) { -; CHECK-LABEL: define <8 x i32> @vector_insert_end( -; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[RESULT]] -; - %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4) - ret <8 x i32> %result -} - -define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) { -; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite( -; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: ret <8 x i32> [[SUBVEC]] -; - %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, 
i64 0) - ret <8 x i32> %result -} - -define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) { -; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end( -; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> -; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> -; CHECK-NEXT: ret <8 x i32> [[RESULT]] -; - %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7) - ret <8 x i32> %result -} - -define @vector_insert_no_fold_scalable( %vec, <2 x i32> %subvec) { -; CHECK-LABEL: define @vector_insert_no_fold_scalable( -; CHECK-SAME: [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) { -; CHECK-NEXT: [[RESULT:%.*]] = call @llvm.vector.insert.nxv4i32.v2i32( [[VEC]], <2 x i32> [[SUBVEC]], i64 0) -; CHECK-NEXT: ret [[RESULT]] -; - %result = call @llvm.vector.insert.nxv4i32.v2i32( %vec, <2 x i32> %subvec, i64 0) - ret %result -} From 78a18d03e9fca922403550762369a3c2202ba7fb Mon Sep 17 00:00:00 2001 From: Lauren Date: Tue, 1 Jul 2025 11:44:43 -0400 Subject: [PATCH 3/7] [VectorCombine] Use std::iota for shuffle mask construction --- .../Transforms/Vectorize/VectorCombine.cpp | 21 +++++++------------ 1 file changed, 7 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index dbbc6c5a07ec8..55c320103afb2 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -848,24 +848,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { // shufflevector requires the two input vectors to be the same width. // Elements beyond the bounds of SubVec within the widened vector are // undefined. 
- SmallVector WidenMask; - unsigned int i = 0; - for (i = 0; i != SubVecNumElts; ++i) - WidenMask.push_back(i); - for (; i != VecNumElts; ++i) - WidenMask.push_back(PoisonMaskElem); + SmallVector WidenMask(VecNumElts, PoisonMaskElem); + std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0); + std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem); auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); Worklist.pushValue(WidenShuffle); - SmallVector Mask; - unsigned int j; - for (i = 0; i != SubVecIdx; ++i) - Mask.push_back(i); - for (j = 0; j != SubVecNumElts; ++j) - Mask.push_back(DstNumElts + j); - for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i) - Mask.push_back(i); + SmallVector Mask(DstNumElts); + std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0); + std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts); + std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts); auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); replaceValue(I, *Shuffle); From f8588d501aeb649a3cb2670fde66cefc17a17c8f Mon Sep 17 00:00:00 2001 From: Lauren Date: Tue, 1 Jul 2025 13:22:04 -0400 Subject: [PATCH 4/7] [VectorCombine] Remove redundant `fill` and reduce three loops to two `iota` calls --- llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 55c320103afb2..46a908420c5e2 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -835,7 +835,8 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { unsigned SubVecIdx = SubVecPtr->getZExtValue(); // Ensure insertion of SubVec doesn't exceed Dst bounds. 
- if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts) + if ((SubVecIdx % SubVecNumElts != 0) || + (SubVecIdx + SubVecNumElts > DstNumElts)) return false; // An insert that entirely overwrites Vec with SubVec is a nop. @@ -850,18 +851,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { // undefined. SmallVector WidenMask(VecNumElts, PoisonMaskElem); std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0); - std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem); auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); Worklist.pushValue(WidenShuffle); SmallVector Mask(DstNumElts); - std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0); - std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts); - std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts); + std::iota(Mask.begin(), Mask.end(), 0); + std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, + DstNumElts); - auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); - replaceValue(I, *Shuffle); + auto *InsertShuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); + replaceValue(I, *InsertShuffle); return true; } From 91989bbd838e1835f149fd0ddea200d5487f3348 Mon Sep 17 00:00:00 2001 From: Lauren Date: Tue, 1 Jul 2025 13:22:04 -0400 Subject: [PATCH 5/7] [VectorCombine] Remove redundant `fill` and reduce three loops to two `iota` calls --- .../Transforms/PhaseOrdering/X86/fmaddsub.ll | 37 ++++++++----------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll index 5cb2c4530aa57..8e25c9c5547d6 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll @@ -567,22 +567,19 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x 
float> %D, ; ; SSE4-LABEL: @buildvector_mul_subadd_ps256( ; SSE4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] -; SSE4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] -; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> -; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] ; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> -; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> -; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP5:%.*]] = fsub <8 x float> [[A]], [[B]] +; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> +; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> ; SSE4-NEXT: ret <8 x float> [[TMP6]] ; ; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256( ; AVX_FMA4-NEXT: [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]] -; AVX_FMA4-NEXT: [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]] -; AVX_FMA4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> -; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]] ; AVX_FMA4-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> -; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> +; AVX_FMA4-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[A]], [[B]] +; AVX_FMA4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> ; AVX_FMA4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> ; AVX_FMA4-NEXT: [[TMP6:%.*]] = shufflevector <8 x 
float> [[TMP5]], <8 x float> poison, <8 x i32> ; AVX_FMA4-NEXT: ret <8 x float> [[TMP6]] @@ -677,13 +674,11 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float> ; ; AVX_FMA-LABEL: @buildvector_mul_subadd_ps512( ; AVX_FMA-NEXT: [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]] -; AVX_FMA-NEXT: [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]] -; AVX_FMA-NEXT: [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> -; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B:%.*]] ; AVX_FMA-NEXT: [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> -; AVX_FMA-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> -; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP5]], <16 x i32> -; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <16 x i32> +; AVX_FMA-NEXT: [[TMP5:%.*]] = fsub <16 x float> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <16 x i32> +; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x i32> ; AVX_FMA-NEXT: ret <16 x float> [[TMP7]] ; ; AVX512-LABEL: @buildvector_mul_subadd_ps512( @@ -880,13 +875,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double> ; ; AVX_FMA-LABEL: @buildvector_mul_subadd_pd512( ; AVX_FMA-NEXT: [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]] -; AVX_FMA-NEXT: [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]] -; AVX_FMA-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> -; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B:%.*]] ; AVX_FMA-NEXT: [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> -; AVX_FMA-NEXT: 
[[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <8 x i32> -; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP5]], <8 x i32> -; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> +; AVX_FMA-NEXT: [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]] +; AVX_FMA-NEXT: [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <8 x i32> +; AVX_FMA-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP6]], <8 x i32> ; AVX_FMA-NEXT: ret <8 x double> [[TMP7]] ; ; AVX512-LABEL: @buildvector_mul_subadd_pd512( From 6855aad254f6885757344d430c389044affd2cf9 Mon Sep 17 00:00:00 2001 From: Lauren Chin Date: Thu, 10 Jul 2025 00:45:33 -0400 Subject: [PATCH 6/7] Revert deletion of InstCombinerImpl::visitCallInst vector_insert in InstCombineCalls.cpp --- .../InstCombine/InstCombineCalls.cpp | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index df29024a86f67..5b398d3b75f59 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -3462,6 +3462,52 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { } break; } + case Intrinsic::vector_insert: { + Value *Vec = II->getArgOperand(0); + Value *SubVec = II->getArgOperand(1); + Value *Idx = II->getArgOperand(2); + auto *DstTy = dyn_cast(II->getType()); + auto *VecTy = dyn_cast(Vec->getType()); + auto *SubVecTy = dyn_cast(SubVec->getType()); + + // Only canonicalize if the destination vector, Vec, and SubVec are all + // fixed vectors. 
+ if (DstTy && VecTy && SubVecTy) { + unsigned DstNumElts = DstTy->getNumElements(); + unsigned VecNumElts = VecTy->getNumElements(); + unsigned SubVecNumElts = SubVecTy->getNumElements(); + unsigned IdxN = cast(Idx)->getZExtValue(); + + // An insert that entirely overwrites Vec with SubVec is a nop. + if (VecNumElts == SubVecNumElts) + return replaceInstUsesWith(CI, SubVec); + + // Widen SubVec into a vector of the same width as Vec, since + // shufflevector requires the two input vectors to be the same width. + // Elements beyond the bounds of SubVec within the widened vector are + // undefined. + SmallVector WidenMask; + unsigned i; + for (i = 0; i != SubVecNumElts; ++i) + WidenMask.push_back(i); + for (; i != VecNumElts; ++i) + WidenMask.push_back(PoisonMaskElem); + + Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); + + SmallVector Mask; + for (unsigned i = 0; i != IdxN; ++i) + Mask.push_back(i); + for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i) + Mask.push_back(i); + for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i) + Mask.push_back(i); + + Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); + return replaceInstUsesWith(CI, Shuffle); + } + break; + } case Intrinsic::vector_extract: { Value *Vec = II->getArgOperand(0); Value *Idx = II->getArgOperand(1); From fe0287ff6783da063fda378b494ba1358d205015 Mon Sep 17 00:00:00 2001 From: Lauren Chin Date: Thu, 10 Jul 2025 06:05:04 -0400 Subject: [PATCH 7/7] [InstCombine] Restore and extend test for vector_insert. 
Refactor fold logic in InstCombinerImpl::visitCallInst --- .../InstCombine/InstCombineCalls.cpp | 23 ++- .../Transforms/Vectorize/VectorCombine.cpp | 7 +- .../InstCombine/canonicalize-vector-insert.ll | 135 ++++++++++++++++++ 3 files changed, 148 insertions(+), 17 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 5b398d3b75f59..5d76ee5a01dce 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -72,6 +72,7 @@ #include #include #include +#include #include #include #include @@ -3478,6 +3479,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { unsigned SubVecNumElts = SubVecTy->getNumElements(); unsigned IdxN = cast(Idx)->getZExtValue(); + if ((IdxN % SubVecNumElts != 0) || (IdxN + SubVecNumElts > DstNumElts)) + return II; + // An insert that entirely overwrites Vec with SubVec is a nop. if (VecNumElts == SubVecNumElts) return replaceInstUsesWith(CI, SubVec); @@ -3486,22 +3490,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) { // shufflevector requires the two input vectors to be the same width. // Elements beyond the bounds of SubVec within the widened vector are // undefined. 
- SmallVector WidenMask; - unsigned i; - for (i = 0; i != SubVecNumElts; ++i) - WidenMask.push_back(i); - for (; i != VecNumElts; ++i) - WidenMask.push_back(PoisonMaskElem); + SmallVector WidenMask(VecNumElts, PoisonMaskElem); + std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0); Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask); - SmallVector Mask; - for (unsigned i = 0; i != IdxN; ++i) - Mask.push_back(i); - for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i) - Mask.push_back(i); - for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i) - Mask.push_back(i); + SmallVector Mask(DstNumElts); + std::iota(Mask.begin(), Mask.end(), 0); + std::iota(Mask.begin() + IdxN, Mask.begin() + IdxN + SubVecNumElts, + DstNumElts); Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); return replaceInstUsesWith(CI, Shuffle); diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 46a908420c5e2..d1ecb8eebbcbb 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -832,11 +832,10 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { if (!SubVecPtr) return false; - unsigned SubVecIdx = SubVecPtr->getZExtValue(); + unsigned IdxN = SubVecPtr->getZExtValue(); // Ensure insertion of SubVec doesn't exceed Dst bounds. - if ((SubVecIdx % SubVecNumElts != 0) || - (SubVecIdx + SubVecNumElts > DstNumElts)) + if ((IdxN % SubVecNumElts != 0) || (IdxN + SubVecNumElts > DstNumElts)) return false; // An insert that entirely overwrites Vec with SubVec is a nop. 
@@ -857,7 +856,7 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) { SmallVector Mask(DstNumElts); std::iota(Mask.begin(), Mask.end(), 0); - std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, + std::iota(Mask.begin() + IdxN, Mask.begin() + IdxN + SubVecNumElts, DstNumElts); auto *InsertShuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask); diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll new file mode 100644 index 0000000000000..ee8a97504dbe0 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes=instcombine -S | FileCheck %s + +; llvm.vector.insert canonicalizes to shufflevector in the fixed case. In the +; scalable case, we lower to the INSERT_SUBVECTOR ISD node. + +declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 %idx) +declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 %idx) +declare @llvm.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 %idx) + +; ============================================================================ ; +; Trivial cases +; ============================================================================ ; + +; An insert that entirely overwrites an <n x ty> with another <n x ty> is a +; nop.
+define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) { +; CHECK-LABEL: @trivial_nop( +; CHECK-NEXT: ret <8 x i32> [[SUBVEC:%.*]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +; ============================================================================ ; +; Valid canonicalizations +; ============================================================================ ; + +define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_a( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_b(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_b( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_c(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_c( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_d( +; CHECK-NEXT: [[TMP1:%.*]] = 
shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 6) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_e( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_f( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_g( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_h( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], 
<8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[TMP2]] +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 3) + ret <8 x i32> %1 +} + +; Tests insertion at middle index +define <8 x i32> @valid_insertion_i(<8 x i32> %vec, <2 x i32> %subvec) { +; CHECK-LABEL: @valid_insertion_i( +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> +; CHECK-NEXT: ret <8 x i32> [[RESULT]] +; + %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2) + ret <8 x i32> %result +} + +; ============================================================================ ; +; Scalable cases +; ============================================================================ ; + +; Scalable insertions should not be canonicalized. This will be lowered to the +; INSERT_SUBVECTOR ISD node later. +define @scalable_insert( %vec, <4 x i32> %subvec) { +; CHECK-LABEL: @scalable_insert( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP1]] +; + %1 = call @llvm.vector.insert.nxv4i32.v4i32( %vec, <4 x i32> %subvec, i64 0) + ret %1 +}