From 27569450c58c82fd717007da9510944993660a04 Mon Sep 17 00:00:00 2001
From: Lauren <lchin@berkeley.edu>
Date: Tue, 1 Jul 2025 02:55:20 -0400
Subject: [PATCH 1/7] [VectorCombine] Expand `vector_insert` into shufflevector
 for earlier cost optimizations (#145512)

Move folding logic from `InstCombineCalls` to `VectorCombine` to ensure `vector_insert` intrinsics are expanded into shufflevector instructions before cost-based shuffle optimizations run.

Canonicalizes fixed-width vectors only.
---
 .../InstCombine/InstCombineCalls.cpp          | 46 ------------
 .../Transforms/Vectorize/VectorCombine.cpp    | 71 +++++++++++++++++++
 .../VectorCombine/fold-vector-insert.ll       | 71 +++++++++++++++++++
 3 files changed, 142 insertions(+), 46 deletions(-)
 create mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5b398d3b75f59..df29024a86f67 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3462,52 +3462,6 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
-  case Intrinsic::vector_insert: {
-    Value *Vec = II->getArgOperand(0);
-    Value *SubVec = II->getArgOperand(1);
-    Value *Idx = II->getArgOperand(2);
-    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
-    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
-    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
-
-    // Only canonicalize if the destination vector, Vec, and SubVec are all
-    // fixed vectors.
-    if (DstTy && VecTy && SubVecTy) {
-      unsigned DstNumElts = DstTy->getNumElements();
-      unsigned VecNumElts = VecTy->getNumElements();
-      unsigned SubVecNumElts = SubVecTy->getNumElements();
-      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
-
-      // An insert that entirely overwrites Vec with SubVec is a nop.
-      if (VecNumElts == SubVecNumElts)
-        return replaceInstUsesWith(CI, SubVec);
-
-      // Widen SubVec into a vector of the same width as Vec, since
-      // shufflevector requires the two input vectors to be the same width.
-      // Elements beyond the bounds of SubVec within the widened vector are
-      // undefined.
-      SmallVector<int, 8> WidenMask;
-      unsigned i;
-      for (i = 0; i != SubVecNumElts; ++i)
-        WidenMask.push_back(i);
-      for (; i != VecNumElts; ++i)
-        WidenMask.push_back(PoisonMaskElem);
-
-      Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
-
-      SmallVector<int, 8> Mask;
-      for (unsigned i = 0; i != IdxN; ++i)
-        Mask.push_back(i);
-      for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
-        Mask.push_back(i);
-      for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
-        Mask.push_back(i);
-
-      Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
-      return replaceInstUsesWith(CI, Shuffle);
-    }
-    break;
-  }
   case Intrinsic::vector_extract: {
     Value *Vec = II->getArgOperand(0);
     Value *Idx = II->getArgOperand(1);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 19e82099e87f0..dbbc6c5a07ec8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -112,6 +112,7 @@ class VectorCombine {
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
   bool foldInsExtBinop(Instruction &I);
+  bool foldVectorInsertToShuffle(Instruction &I);
   bool foldInsExtVectorToShuffle(Instruction &I);
   bool foldBitOpOfBitcasts(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
@@ -804,6 +805,73 @@ bool VectorCombine::foldInsExtBinop(Instruction &I) {
   return true;
 }
 
+/// Try to fold vector_insert intrinsics into shufflevector instructions.
+bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
+  auto *II = dyn_cast<IntrinsicInst>(&I);
+  // This optimization only applies to vector_insert intrinsics.
+  if (!II || II->getIntrinsicID() != Intrinsic::vector_insert)
+    return false;
+
+  Value *Vec = II->getArgOperand(0);
+  Value *SubVec = II->getArgOperand(1);
+  Value *Idx = II->getArgOperand(2);
+
+  // Caller guarantees DstTy is a fixed vector.
+  auto *DstTy = cast<FixedVectorType>(II->getType());
+  auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+  auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+
+  // Only canonicalize if Vec and SubVec are both fixed vectors.
+  if (!VecTy || !SubVecTy)
+    return false;
+
+  unsigned DstNumElts = DstTy->getNumElements();
+  unsigned VecNumElts = VecTy->getNumElements();
+  unsigned SubVecNumElts = SubVecTy->getNumElements();
+  auto *SubVecPtr = dyn_cast<ConstantInt>(Idx);
+  if (!SubVecPtr)
+    return false;
+
+  unsigned SubVecIdx = SubVecPtr->getZExtValue();
+
+  // Ensure insertion of SubVec doesn't exceed Dst bounds.
+  if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+    return false;
+
+  // An insert that entirely overwrites Vec with SubVec is a nop.
+  if (VecNumElts == SubVecNumElts) {
+    replaceValue(I, *SubVec);
+    return true;
+  }
+
+  // Widen SubVec into a vector of the same width as Vec, since
+  // shufflevector requires the two input vectors to be the same width.
+  // Elements beyond the bounds of SubVec within the widened vector are
+  // undefined.
+  SmallVector<int, 8> WidenMask;
+  unsigned int i = 0;
+  for (i = 0; i != SubVecNumElts; ++i)
+    WidenMask.push_back(i);
+  for (; i != VecNumElts; ++i)
+    WidenMask.push_back(PoisonMaskElem);
+
+  auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
+  Worklist.pushValue(WidenShuffle);
+
+  SmallVector<int, 8> Mask;
+  unsigned int j;
+  for (i = 0; i != SubVecIdx; ++i)
+    Mask.push_back(i);
+  for (j = 0; j != SubVecNumElts; ++j)
+    Mask.push_back(DstNumElts + j);
+  for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i)
+    Mask.push_back(i);
+
+  auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+  replaceValue(I, *Shuffle);
+  return true;
+}
+
 bool VectorCombine::foldBitOpOfBitcasts(Instruction &I) {
   // Match: bitop(bitcast(x), bitcast(y)) -> bitcast(bitop(x, y))
   Value *LHSSrc, *RHSSrc;
@@ -3639,6 +3707,9 @@ bool VectorCombine::run() {
     // dispatching to folding functions if there's no chance of matching.
     if (IsFixedVectorType) {
       switch (Opcode) {
+      case Instruction::Call:
+        MadeChange |= foldVectorInsertToShuffle(I);
+        break;
       case Instruction::InsertElement:
         MadeChange |= vectorizeLoadInsert(I);
         break;
diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
new file mode 100644
index 0000000000000..976fdb322005b
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64)
+declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32>, <2 x i32>, i64)
+
+define <8 x i32> @vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_begin(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_middle(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    ret <8 x i32> [[SUBVEC]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
+  ret <8 x i32> %result
+}
+
+define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) {
+; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end(
+; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7)
+  ret <8 x i32> %result
+}
+
+define <vscale x 4 x i32> @vector_insert_no_fold_scalable(<vscale x 4 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: define <vscale x 4 x i32> @vector_insert_no_fold_scalable(
+; CHECK-SAME: <vscale x 4 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
+; CHECK-NEXT:    [[RESULT:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> [[VEC]], <2 x i32> [[SUBVEC]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[RESULT]]
+;
+  %result = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <vscale x 4 x i32> %result
+}

From fc894c8d658554dbb04e5a99c3c8f1aeef986220 Mon Sep 17 00:00:00 2001
From: Lauren <lchin@berkeley.edu>
Date: Tue, 1 Jul 2025 04:52:10 -0400
Subject: [PATCH 2/7] [VectorCombine] Move canonicalize-vector-insert tests
 from InstCombine to VectorCombine

---
 .../canonicalize-vector-insert.ll             | 19 +++--
 .../VectorCombine/fold-vector-insert.ll       | 71 -------------------
 2 files changed, 15 insertions(+), 75 deletions(-)
 rename llvm/test/Transforms/{InstCombine => VectorCombine}/canonicalize-vector-insert.ll (84%)
 delete mode 100644 llvm/test/Transforms/VectorCombine/fold-vector-insert.ll

diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
similarity index 84%
rename from llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
rename to llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
index ab7a50e55db0f..af6fe52c07920 100644
--- a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
+++ b/llvm/test/Transforms/VectorCombine/canonicalize-vector-insert.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
 
 ; llvm.vector.insert canonicalizes to shufflevector in the fixed case. In the
 ; scalable case, we lower to the INSERT_SUBVECTOR ISD node.
@@ -31,7 +31,7 @@ define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) {
 define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_a(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0)
@@ -71,7 +71,7 @@ define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) {
 define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_e(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
@@ -91,7 +91,7 @@ define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) {
 define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) {
 ; CHECK-LABEL: @valid_insertion_g(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 3, i32 4, i32 5, i32 6, i32 7>
 ; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
 ;
   %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0)
@@ -108,6 +108,17 @@ define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) {
   ret <8 x i32> %1
 }
 
+; Tests insertion at middle index
+define <8 x i32> @valid_insertion_i(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_i(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %result
+}
+
 ; ============================================================================ ;
 ; Scalable cases
 ; ============================================================================ ;
diff --git a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll b/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
deleted file mode 100644
index 976fdb322005b..0000000000000
--- a/llvm/test/Transforms/VectorCombine/fold-vector-insert.ll
+++ /dev/null
@@ -1,71 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
-
-declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32>, <4 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32>, <8 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32>, <2 x i32>, i64)
-declare <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32>, <1 x i32>, i64)
-declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32>, <2 x i32>, i64)
-
-define <8 x i32> @vector_insert_begin(<8 x i32> %vec, <4 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_begin(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_middle(<8 x i32> %vec, <2 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_middle(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_end(<8 x i32> %vec, <4 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_end(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_overwrite(<8 x i32> %vec, <8 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_overwrite(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <8 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    ret <8 x i32> [[SUBVEC]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
-  ret <8 x i32> %result
-}
-
-define <8 x i32> @vector_insert_single_element_at_end(<8 x i32> %vec, <1 x i32> %subvec) {
-; CHECK-LABEL: define <8 x i32> @vector_insert_single_element_at_end(
-; CHECK-SAME: <8 x i32> [[VEC:%.*]], <1 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <1 x i32> [[SUBVEC]], <1 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 8>
-; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
-;
-  %result = call <8 x i32> @llvm.vector.insert.v8i32.v1i32(<8 x i32> %vec, <1 x i32> %subvec, i64 7)
-  ret <8 x i32> %result
-}
-
-define <vscale x 4 x i32> @vector_insert_no_fold_scalable(<vscale x 4 x i32> %vec, <2 x i32> %subvec) {
-; CHECK-LABEL: define <vscale x 4 x i32> @vector_insert_no_fold_scalable(
-; CHECK-SAME: <vscale x 4 x i32> [[VEC:%.*]], <2 x i32> [[SUBVEC:%.*]]) {
-; CHECK-NEXT:    [[RESULT:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> [[VEC]], <2 x i32> [[SUBVEC]], i64 0)
-; CHECK-NEXT:    ret <vscale x 4 x i32> [[RESULT]]
-;
-  %result = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v2i32(<vscale x 4 x i32> %vec, <2 x i32> %subvec, i64 0)
-  ret <vscale x 4 x i32> %result
-}

From 78a18d03e9fca922403550762369a3c2202ba7fb Mon Sep 17 00:00:00 2001
From: Lauren <lchin@berkeley.edu>
Date: Tue, 1 Jul 2025 11:44:43 -0400
Subject: [PATCH 3/7] [VectorCombine] Use std::iota for shuffle mask
 construction

---
 .../Transforms/Vectorize/VectorCombine.cpp    | 21 +++++++------------
 1 file changed, 7 insertions(+), 14 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index dbbc6c5a07ec8..55c320103afb2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -848,24 +848,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   // shufflevector requires the two input vectors to be the same width.
   // Elements beyond the bounds of SubVec within the widened vector are
   // undefined.
-  SmallVector<int, 8> WidenMask;
-  unsigned int i = 0;
-  for (i = 0; i != SubVecNumElts; ++i)
-    WidenMask.push_back(i);
-  for (; i != VecNumElts; ++i)
-    WidenMask.push_back(PoisonMaskElem);
+  SmallVector<int, 8> WidenMask(VecNumElts, PoisonMaskElem);
+  std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
+  std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem);
 
   auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
   Worklist.pushValue(WidenShuffle);
 
-  SmallVector<int, 8> Mask;
-  unsigned int j;
-  for (i = 0; i != SubVecIdx; ++i)
-    Mask.push_back(i);
-  for (j = 0; j != SubVecNumElts; ++j)
-    Mask.push_back(DstNumElts + j);
-  for (i = SubVecIdx + SubVecNumElts; i != DstNumElts; ++i)
-    Mask.push_back(i);
+  SmallVector<int, 8> Mask(DstNumElts);
+  std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0);
+  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts);
+  std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts);
 
   auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
   replaceValue(I, *Shuffle);

From f8588d501aeb649a3cb2670fde66cefc17a17c8f Mon Sep 17 00:00:00 2001
From: Lauren <lchin@berkeley.edu>
Date: Tue, 1 Jul 2025 13:22:04 -0400
Subject: [PATCH 4/7] [VectorCombine] Remove redundant `fill` and reduce three
 loops to two `iota`  calls

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 55c320103afb2..46a908420c5e2 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -835,7 +835,8 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   unsigned SubVecIdx = SubVecPtr->getZExtValue();
 
   // Ensure insertion of SubVec doesn't exceed Dst bounds.
-  if (SubVecIdx % SubVecNumElts != 0 || SubVecIdx + SubVecNumElts > DstNumElts)
+  if ((SubVecIdx % SubVecNumElts != 0) ||
+      (SubVecIdx + SubVecNumElts > DstNumElts))
     return false;
 
   // An insert that entirely overwrites Vec with SubVec is a nop.
@@ -850,18 +851,17 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   // undefined.
   SmallVector<int, 8> WidenMask(VecNumElts, PoisonMaskElem);
   std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
-  std::fill(WidenMask.begin() + SubVecNumElts, WidenMask.end(), PoisonMaskElem);
 
   auto *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
   Worklist.pushValue(WidenShuffle);
 
   SmallVector<int, 8> Mask(DstNumElts);
-  std::iota(Mask.begin(), Mask.begin() + SubVecIdx, 0);
-  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts, DstNumElts);
-  std::iota(Mask.begin() + SubVecIdx + SubVecNumElts, Mask.end(), SubVecIdx + SubVecNumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts,
+            DstNumElts);
 
-  auto *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
-  replaceValue(I, *Shuffle);
+  auto *InsertShuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+  replaceValue(I, *InsertShuffle);
   return true;
 }
 

From 91989bbd838e1835f149fd0ddea200d5487f3348 Mon Sep 17 00:00:00 2001
From: Lauren <lchin@berkeley.edu>
Date: Tue, 1 Jul 2025 13:22:04 -0400
Subject: [PATCH 5/7] [VectorCombine] Remove redundant `fill` and reduce three
 loops to two `iota`  calls

---
 .../Transforms/PhaseOrdering/X86/fmaddsub.ll  | 37 ++++++++-----------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
index 5cb2c4530aa57..8e25c9c5547d6 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/fmaddsub.ll
@@ -567,22 +567,19 @@ define <8 x float> @buildvector_mul_subadd_ps256(<8 x float> %C, <8 x float> %D,
 ;
 ; SSE4-LABEL: @buildvector_mul_subadd_ps256(
 ; SSE4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
-; SSE4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
-; SSE4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
 ; SSE4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; SSE4-NEXT:    [[TMP5:%.*]] = fsub <8 x float> [[A]], [[B]]
+; SSE4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 ; SSE4-NEXT:    ret <8 x float> [[TMP6]]
 ;
 ; AVX_FMA4-LABEL: @buildvector_mul_subadd_ps256(
 ; AVX_FMA4-NEXT:    [[A:%.*]] = fmul <8 x float> [[C:%.*]], [[D:%.*]]
-; AVX_FMA4-NEXT:    [[TMP0:%.*]] = fsub <8 x float> [[A]], [[B:%.*]]
-; AVX_FMA4-NEXT:    [[TMP1:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP2:%.*]] = fadd <8 x float> [[A]], [[B:%.*]]
 ; AVX_FMA4-NEXT:    [[TMP3:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA4-NEXT:    [[TMP7:%.*]] = fsub <8 x float> [[A]], [[B]]
+; AVX_FMA4-NEXT:    [[TMP4:%.*]] = shufflevector <8 x float> [[TMP7]], <8 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
 ; AVX_FMA4-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
 ; AVX_FMA4-NEXT:    [[TMP6:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
 ; AVX_FMA4-NEXT:    ret <8 x float> [[TMP6]]
@@ -677,13 +674,11 @@ define <16 x float> @buildvector_mul_subadd_ps512(<16 x float> %C, <16 x float>
 ;
 ; AVX_FMA-LABEL: @buildvector_mul_subadd_ps512(
 ; AVX_FMA-NEXT:    [[A:%.*]] = fmul <16 x float> [[C:%.*]], [[D:%.*]]
-; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <16 x float> [[A]], [[B:%.*]]
-; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <16 x float> [[A]], [[B:%.*]]
 ; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <8 x float> [[TMP2]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP5]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
-; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP6]], <16 x float> poison, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+; AVX_FMA-NEXT:    [[TMP5:%.*]] = fsub <16 x float> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <16 x float> [[TMP4]], <16 x float> [[TMP6]], <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
 ; AVX_FMA-NEXT:    ret <16 x float> [[TMP7]]
 ;
 ; AVX512-LABEL: @buildvector_mul_subadd_ps512(
@@ -880,13 +875,11 @@ define <8 x double> @buildvector_mul_subadd_pd512(<8 x double> %C, <8 x double>
 ;
 ; AVX_FMA-LABEL: @buildvector_mul_subadd_pd512(
 ; AVX_FMA-NEXT:    [[A:%.*]] = fmul <8 x double> [[C:%.*]], [[D:%.*]]
-; AVX_FMA-NEXT:    [[TMP1:%.*]] = fsub <8 x double> [[A]], [[B:%.*]]
-; AVX_FMA-NEXT:    [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP3:%.*]] = fadd <8 x double> [[A]], [[B:%.*]]
 ; AVX_FMA-NEXT:    [[TMP4:%.*]] = shufflevector <8 x double> [[TMP3]], <8 x double> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP5:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP5]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
-; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+; AVX_FMA-NEXT:    [[TMP5:%.*]] = fsub <8 x double> [[A]], [[B]]
+; AVX_FMA-NEXT:    [[TMP6:%.*]] = shufflevector <8 x double> [[TMP5]], <8 x double> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX_FMA-NEXT:    [[TMP7:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> [[TMP6]], <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
 ; AVX_FMA-NEXT:    ret <8 x double> [[TMP7]]
 ;
 ; AVX512-LABEL: @buildvector_mul_subadd_pd512(

From 6855aad254f6885757344d430c389044affd2cf9 Mon Sep 17 00:00:00 2001
From: Lauren Chin <lchin@uw.edu>
Date: Thu, 10 Jul 2025 00:45:33 -0400
Subject: [PATCH 6/7] Revert deletion of InstCombinerImpl::visitCallInst
 vector_insert in InstCombineCalls.cpp

---
 .../InstCombine/InstCombineCalls.cpp          | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index df29024a86f67..5b398d3b75f59 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3462,6 +3462,52 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
     }
     break;
   }
+  case Intrinsic::vector_insert: {
+    Value *Vec = II->getArgOperand(0);
+    Value *SubVec = II->getArgOperand(1);
+    Value *Idx = II->getArgOperand(2);
+    auto *DstTy = dyn_cast<FixedVectorType>(II->getType());
+    auto *VecTy = dyn_cast<FixedVectorType>(Vec->getType());
+    auto *SubVecTy = dyn_cast<FixedVectorType>(SubVec->getType());
+
+    // Only canonicalize if the destination vector, Vec, and SubVec are all
+    // fixed vectors.
+    if (DstTy && VecTy && SubVecTy) {
+      unsigned DstNumElts = DstTy->getNumElements();
+      unsigned VecNumElts = VecTy->getNumElements();
+      unsigned SubVecNumElts = SubVecTy->getNumElements();
+      unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
+
+      // An insert that entirely overwrites Vec with SubVec is a nop.
+      if (VecNumElts == SubVecNumElts)
+        return replaceInstUsesWith(CI, SubVec);
+
+      // Widen SubVec into a vector of the same width as Vec, since
+      // shufflevector requires the two input vectors to be the same width.
+      // Elements beyond the bounds of SubVec within the widened vector are
+      // undefined.
+      SmallVector<int, 8> WidenMask;
+      unsigned i;
+      for (i = 0; i != SubVecNumElts; ++i)
+        WidenMask.push_back(i);
+      for (; i != VecNumElts; ++i)
+        WidenMask.push_back(PoisonMaskElem);
+
+      Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
+
+      SmallVector<int, 8> Mask;
+      for (unsigned i = 0; i != IdxN; ++i)
+        Mask.push_back(i);
+      for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
+        Mask.push_back(i);
+      for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
+        Mask.push_back(i);
+
+      Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
+      return replaceInstUsesWith(CI, Shuffle);
+    }
+    break;
+  }
   case Intrinsic::vector_extract: {
     Value *Vec = II->getArgOperand(0);
     Value *Idx = II->getArgOperand(1);

From fe0287ff6783da063fda378b494ba1358d205015 Mon Sep 17 00:00:00 2001
From: Lauren Chin <lchin@uw.edu>
Date: Thu, 10 Jul 2025 06:05:04 -0400
Subject: [PATCH 7/7] [InstCombine] Restore and extend test for vector_insert.
 Refactor fold logic in InstCombinerImpl::visitCallInst

---
 .../InstCombine/InstCombineCalls.cpp          |  23 ++-
 .../Transforms/Vectorize/VectorCombine.cpp    |   7 +-
 .../InstCombine/canonicalize-vector-insert.ll | 135 ++++++++++++++++++
 3 files changed, 148 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 5b398d3b75f59..5d76ee5a01dce 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -72,6 +72,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
+#include <numeric>
 #include <optional>
 #include <utility>
 #include <vector>
@@ -3478,6 +3479,9 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       unsigned SubVecNumElts = SubVecTy->getNumElements();
       unsigned IdxN = cast<ConstantInt>(Idx)->getZExtValue();
 
+      if ((IdxN % SubVecNumElts != 0) || (IdxN + SubVecNumElts > DstNumElts))
+        return II;
+
       // An insert that entirely overwrites Vec with SubVec is a nop.
       if (VecNumElts == SubVecNumElts)
         return replaceInstUsesWith(CI, SubVec);
@@ -3486,22 +3490,15 @@ Instruction *InstCombinerImpl::visitCallInst(CallInst &CI) {
       // shufflevector requires the two input vectors to be the same width.
       // Elements beyond the bounds of SubVec within the widened vector are
       // undefined.
-      SmallVector<int, 8> WidenMask;
-      unsigned i;
-      for (i = 0; i != SubVecNumElts; ++i)
-        WidenMask.push_back(i);
-      for (; i != VecNumElts; ++i)
-        WidenMask.push_back(PoisonMaskElem);
+      SmallVector<int, 8> WidenMask(VecNumElts, PoisonMaskElem);
+      std::iota(WidenMask.begin(), WidenMask.begin() + SubVecNumElts, 0);
 
       Value *WidenShuffle = Builder.CreateShuffleVector(SubVec, WidenMask);
 
-      SmallVector<int, 8> Mask;
-      for (unsigned i = 0; i != IdxN; ++i)
-        Mask.push_back(i);
-      for (unsigned i = DstNumElts; i != DstNumElts + SubVecNumElts; ++i)
-        Mask.push_back(i);
-      for (unsigned i = IdxN + SubVecNumElts; i != DstNumElts; ++i)
-        Mask.push_back(i);
+      SmallVector<int, 8> Mask(DstNumElts);
+      std::iota(Mask.begin(), Mask.end(), 0);
+      std::iota(Mask.begin() + IdxN, Mask.begin() + IdxN + SubVecNumElts,
+                DstNumElts);
 
       Value *Shuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
       return replaceInstUsesWith(CI, Shuffle);
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 46a908420c5e2..d1ecb8eebbcbb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -832,11 +832,10 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
   if (!SubVecPtr)
     return false;
 
-  unsigned SubVecIdx = SubVecPtr->getZExtValue();
+  unsigned IdxN = SubVecPtr->getZExtValue();
 
   // Ensure insertion of SubVec doesn't exceed Dst bounds.
-  if ((SubVecIdx % SubVecNumElts != 0) ||
-      (SubVecIdx + SubVecNumElts > DstNumElts))
+  if ((IdxN % SubVecNumElts != 0) || (IdxN + SubVecNumElts > DstNumElts))
     return false;
 
   // An insert that entirely overwrites Vec with SubVec is a nop.
@@ -857,7 +856,7 @@ bool VectorCombine::foldVectorInsertToShuffle(Instruction &I) {
 
   SmallVector<int, 8> Mask(DstNumElts);
   std::iota(Mask.begin(), Mask.end(), 0);
-  std::iota(Mask.begin() + SubVecIdx, Mask.begin() + SubVecIdx + SubVecNumElts,
+  std::iota(Mask.begin() + IdxN, Mask.begin() + IdxN + SubVecNumElts,
             DstNumElts);
 
   auto *InsertShuffle = Builder.CreateShuffleVector(Vec, WidenShuffle, Mask);
diff --git a/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
new file mode 100644
index 0000000000000..ee8a97504dbe0
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/canonicalize-vector-insert.ll
@@ -0,0 +1,135 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -passes=instcombine -S | FileCheck %s
+
+; llvm.vector.insert canonicalizes to shufflevector in the fixed case. In the
+; scalable case, we lower to the INSERT_SUBVECTOR ISD node.
+
+declare <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 %idx)
+declare <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 %idx)
+declare <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 %idx)
+declare <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 %idx)
+declare <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 %idx)
+
+; ============================================================================ ;
+; Trivial cases
+; ============================================================================ ;
+
+; An insert that entirely overwrites an <n x ty> with another <n x ty> is a
+; nop.
+define <8 x i32> @trivial_nop(<8 x i32> %vec, <8 x i32> %subvec) {
+; CHECK-LABEL: @trivial_nop(
+; CHECK-NEXT:    ret <8 x i32> [[SUBVEC:%.*]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v8i32(<8 x i32> %vec, <8 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+; ============================================================================ ;
+; Valid canonicalizations
+; ============================================================================ ;
+
+define <8 x i32> @valid_insertion_a(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_a(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_b(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_b(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_c(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_c(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 4)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_d(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_d(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 8, i32 9>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 6)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_e(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_e(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_f(<8 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_f(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> [[SUBVEC:%.*]], <4 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v4i32(<8 x i32> %vec, <4 x i32> %subvec, i64 4)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_g(<8 x i32> %vec, <3 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_g(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[VEC:%.*]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 0)
+  ret <8 x i32> %1
+}
+
+define <8 x i32> @valid_insertion_h(<8 x i32> %vec, <3 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_h(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <3 x i32> [[SUBVEC:%.*]], <3 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 8, i32 9, i32 10, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[TMP2]]
+;
+  %1 = call <8 x i32> @llvm.vector.insert.v8i32.v3i32(<8 x i32> %vec, <3 x i32> %subvec, i64 3)
+  ret <8 x i32> %1
+}
+
+; Tests insertion at middle index
+define <8 x i32> @valid_insertion_i(<8 x i32> %vec, <2 x i32> %subvec) {
+; CHECK-LABEL: @valid_insertion_i(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[SUBVEC:%.*]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT:    [[RESULT:%.*]] = shufflevector <8 x i32> [[VEC:%.*]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x i32> [[RESULT]]
+;
+  %result = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> %vec, <2 x i32> %subvec, i64 2)
+  ret <8 x i32> %result
+}
+
+; ============================================================================ ;
+; Scalable cases
+; ============================================================================ ;
+
+; Scalable insertions should not be canonicalized. This will be lowered to the
+; INSERT_SUBVECTOR ISD node later.
+define <vscale x 4 x i32> @scalable_insert(<vscale x 4 x i32> %vec, <4 x i32> %subvec) {
+; CHECK-LABEL: @scalable_insert(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> [[VEC:%.*]], <4 x i32> [[SUBVEC:%.*]], i64 0)
+; CHECK-NEXT:    ret <vscale x 4 x i32> [[TMP1]]
+;
+  %1 = call <vscale x 4 x i32> @llvm.vector.insert.nxv4i32.v4i32(<vscale x 4 x i32> %vec, <4 x i32> %subvec, i64 0)
+  ret <vscale x 4 x i32> %1
+}