From 6be9be5e0b4ba9d561dc005bea4eace40b53510d Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 13 Sep 2024 15:42:05 +0200 Subject: [PATCH 01/43] [LLD][COFF][NFC] Store live flag in ImportThunkChunk. (#108459) Instead of ImportFile. This is a preparation for ARM64EC support, which has both x86 and ARM64EC thunks and each of them needs a separate flag. --- lld/COFF/Chunks.cpp | 4 ++++ lld/COFF/Chunks.h | 7 +++++-- lld/COFF/InputFiles.cpp | 2 +- lld/COFF/InputFiles.h | 5 +---- lld/COFF/MapFile.cpp | 2 +- lld/COFF/MarkLive.cpp | 2 +- lld/COFF/PDB.cpp | 4 ++-- lld/COFF/Symbols.cpp | 2 +- lld/COFF/Symbols.h | 4 ++-- lld/COFF/Writer.cpp | 2 +- 10 files changed, 19 insertions(+), 15 deletions(-) diff --git a/lld/COFF/Chunks.cpp b/lld/COFF/Chunks.cpp index 0f33885f7df37..ee54fa39fc3d6 100644 --- a/lld/COFF/Chunks.cpp +++ b/lld/COFF/Chunks.cpp @@ -774,6 +774,10 @@ void StringChunk::writeTo(uint8_t *buf) const { buf[str.size()] = '\0'; } +ImportThunkChunk::ImportThunkChunk(COFFLinkerContext &ctx, Defined *s) + : NonSectionCodeChunk(ImportThunkKind), live(!ctx.config.doGC), + impSymbol(s), ctx(ctx) {} + ImportThunkChunkX64::ImportThunkChunkX64(COFFLinkerContext &ctx, Defined *s) : ImportThunkChunk(ctx, s) { // Intel Optimization Manual says that all branch targets diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h index 040a249aabf59..8ad17a2850782 100644 --- a/lld/COFF/Chunks.h +++ b/lld/COFF/Chunks.h @@ -557,10 +557,13 @@ static const uint8_t importThunkARM64EC[] = { // contents will be a JMP instruction to some __imp_ symbol. class ImportThunkChunk : public NonSectionCodeChunk { public: - ImportThunkChunk(COFFLinkerContext &ctx, Defined *s) - : NonSectionCodeChunk(ImportThunkKind), impSymbol(s), ctx(ctx) {} + ImportThunkChunk(COFFLinkerContext &ctx, Defined *s); static bool classof(const Chunk *c) { return c->kind() == ImportThunkKind; } + // We track the usage of the thunk symbol separately from the import file + // to avoid generating unnecessary thunks. + bool live; + protected: Defined *impSymbol; COFFLinkerContext &ctx; diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 569220468e96a..ee39b46624444 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1002,7 +1002,7 @@ void ObjFile::enqueuePdbFile(StringRef path, ObjFile *fromFile) { } ImportFile::ImportFile(COFFLinkerContext &ctx, MemoryBufferRef m) - : InputFile(ctx, ImportKind, m), live(!ctx.config.doGC), thunkLive(live) {} + : InputFile(ctx, ImportKind, m), live(!ctx.config.doGC) {} MachineTypes ImportFile::getMachineType() const { uint16_t machine = diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 8140a031f7116..0812e9c461045 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -371,11 +371,8 @@ class ImportFile : public InputFile { // are actually in use. // // If the Live bit is turned off by MarkLive, Writer will ignore dllimported - // symbols provided by this import library member. We also track whether the - // imported symbol is used separately from whether the thunk is used in order - // to avoid creating unnecessary thunks. + // symbols provided by this import library member. bool live; - bool thunkLive; }; // Used for LTO. 
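For context, the ARM64EC end state this prepares for: a single import can own both an x86 thunk and an ARM64EC thunk, so the liveness bit has to live on the thunk chunk rather than on the import file. A minimal sketch of the consumer side, assuming a hypothetical thunkChunksOf() helper (the real per-thunk members land later in this series):

    // MarkLive sets the bit on the chunk belonging to the referenced
    // thunk symbol, not on the whole import file:
    if (auto *sym = dyn_cast<DefinedImportThunk>(b))
      sym->getChunk()->live = true;

    // The writer can then keep one thunk while dropping the other:
    for (ImportFile *file : ctx.importFileInstances)
      for (ImportThunkChunk *c : thunkChunksOf(file)) // hypothetical helper
        if (c->live)
          textSec->addChunk(c);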
diff --git a/lld/COFF/MapFile.cpp b/lld/COFF/MapFile.cpp
index ed521dd375ed0..52e9ce996f239 100644
--- a/lld/COFF/MapFile.cpp
+++ b/lld/COFF/MapFile.cpp
@@ -125,7 +125,7 @@ static void getSymbols(const COFFLinkerContext &ctx,
     if (!file->thunkSym)
       continue;
 
-    if (!file->thunkLive)
+    if (!file->thunkSym->isLive())
       continue;
 
     if (auto *thunkSym = dyn_cast<Defined>(file->thunkSym))
diff --git a/lld/COFF/MarkLive.cpp b/lld/COFF/MarkLive.cpp
index 8af58780e1358..3c09baa73a9f7 100644
--- a/lld/COFF/MarkLive.cpp
+++ b/lld/COFF/MarkLive.cpp
@@ -58,7 +58,7 @@ void markLive(COFFLinkerContext &ctx) {
       addImportFile(sym->file);
     } else if (auto *sym = dyn_cast<DefinedImportThunk>(b)) {
       addImportFile(sym->wrappedSym->file);
-      sym->wrappedSym->file->thunkLive = true;
+      sym->getChunk()->live = true;
     }
   };
 
diff --git a/lld/COFF/PDB.cpp b/lld/COFF/PDB.cpp
index c0739b37aeb0f..9b035f53ef49c 100644
--- a/lld/COFF/PDB.cpp
+++ b/lld/COFF/PDB.cpp
@@ -1527,8 +1527,8 @@ void PDBLinker::addImportFilesToPDB() {
     if (!file->thunkSym)
       continue;
 
-    if (!file->thunkLive)
-      continue;
+    if (!file->thunkSym->isLive())
+      continue;
 
     std::string dll = StringRef(file->dllName).lower();
     llvm::pdb::DbiModuleDescriptorBuilder *&mod = dllToModuleDbi[dll];
diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp
index 5f4d797f74a2d..567c2b93776c9 100644
--- a/lld/COFF/Symbols.cpp
+++ b/lld/COFF/Symbols.cpp
@@ -84,7 +84,7 @@ bool Symbol::isLive() const {
   if (auto *imp = dyn_cast<DefinedImportData>(this))
     return imp->file->live;
   if (auto *imp = dyn_cast<DefinedImportThunk>(this))
-    return imp->wrappedSym->file->thunkLive;
+    return imp->getChunk()->live;
   // Assume any other kind of symbol is live.
   return true;
 }
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index 2df60a01ec813..9b21e09bf83a4 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -395,12 +395,12 @@ class DefinedImportThunk : public Defined {
   }
 
   uint64_t getRVA() { return data->getRVA(); }
-  Chunk *getChunk() { return data; }
+  ImportThunkChunk *getChunk() const { return data; }
 
   DefinedImportData *wrappedSym;
 
 private:
-  Chunk *data;
+  ImportThunkChunk *data;
 };
 
 // If you have a symbol "foo" in your object file, a symbol name
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 9a8040008e73c..0b3c4163020f4 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1258,7 +1258,7 @@ void Writer::appendImportThunks() {
       if (!isa<DefinedImportThunk>(file->thunkSym))
         fatal(toString(ctx, *file->thunkSym) + " was replaced");
       DefinedImportThunk *thunk = cast<DefinedImportThunk>(file->thunkSym);
-      if (file->thunkLive)
+      if (thunk->getChunk()->live)
         textSec->addChunk(thunk->getChunk());
       if (file->impchkThunk)
         textSec->addChunk(file->impchkThunk);

From 387bee91f095c197270b4d0a9e19cc86b2edea73 Mon Sep 17 00:00:00 2001
From: JOE1994
Date: Fri, 13 Sep 2024 09:35:57 -0400
Subject: [PATCH 02/43] [llvm][unittests] Strip unneeded uses of
 raw_string_ostream::str() (NFC)

Avoid excess layer of indirection.
---
 llvm/unittests/Bitcode/BitReaderTest.cpp      |  2 +-
 .../CodeGen/GlobalISel/GISelMITest.cpp        |  4 +-
 .../CodeGen/GlobalISel/LegalizerTest.cpp      |  2 +-
 llvm/unittests/CodeGen/MachineInstrTest.cpp   |  5 +--
 llvm/unittests/CodeGen/MachineOperandTest.cpp | 42 +++++++++----------
 5 files changed, 27 insertions(+), 28 deletions(-)

diff --git a/llvm/unittests/Bitcode/BitReaderTest.cpp b/llvm/unittests/Bitcode/BitReaderTest.cpp
index 22cc5e7492803..aea66fc1d8db5 100644
--- a/llvm/unittests/Bitcode/BitReaderTest.cpp
+++ b/llvm/unittests/Bitcode/BitReaderTest.cpp
@@ -38,7 +38,7 @@ std::unique_ptr<Module> parseAssembly(LLVMContext &Context,
 
   // A failure here means that the test itself is buggy.
   if (!M)
-    report_fatal_error(OS.str().c_str());
+    report_fatal_error(ErrMsg.c_str());
 
   return M;
 }
diff --git a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
index db9fb3a2d316e..b0dbd4a10b0a7 100644
--- a/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/GISelMITest.cpp
@@ -14,7 +14,7 @@ operator<<(std::ostream &OS, const LLT Ty) {
   std::string Repr;
   raw_string_ostream SS{Repr};
   Ty.print(SS);
-  OS << SS.str();
+  OS << Repr;
 
   return OS;
 }
@@ -23,7 +23,7 @@ operator<<(std::ostream &OS, const MachineFunction &MF) {
   std::string Repr;
   raw_string_ostream SS{Repr};
   MF.print(SS);
-  OS << SS.str();
+  OS << Repr;
 
   return OS;
 }
diff --git a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
index 401d04954a669..625e2c92b1119 100644
--- a/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
+++ b/llvm/unittests/CodeGen/GlobalISel/LegalizerTest.cpp
@@ -27,7 +27,7 @@ ::testing::AssertionResult isNullMIPtr(const MachineInstr *MI) {
   MI->print(MISStream, /*IsStandalone=*/true, /*SkipOpers=*/false,
             /*SkipDebugLoc=*/false, /*AddNewLine=*/false);
   return ::testing::AssertionFailure()
-         << "unable to legalize instruction: " << MISStream.str();
+         << "unable to legalize instruction: " << MIBuffer;
 }
 
 DefineLegalizerInfo(ALegalizer, {
diff --git a/llvm/unittests/CodeGen/MachineInstrTest.cpp b/llvm/unittests/CodeGen/MachineInstrTest.cpp
index af25acbb38fd5..d1546cf96f8d7 100644
--- a/llvm/unittests/CodeGen/MachineInstrTest.cpp
+++ b/llvm/unittests/CodeGen/MachineInstrTest.cpp
@@ -223,9 +223,8 @@ TEST(MachineInstrPrintingTest, DebugLocPrinting) {
   raw_string_ostream OS(str);
   MI->print(OS, /*IsStandalone*/true, /*SkipOpers*/false, /*SkipDebugLoc*/false,
             /*AddNewLine*/false);
-  ASSERT_TRUE(
-      StringRef(OS.str()).starts_with("$noreg = UNKNOWN debug-location "));
-  ASSERT_TRUE(StringRef(OS.str()).ends_with("filename:1:5"));
+  ASSERT_TRUE(StringRef(str).starts_with("$noreg = UNKNOWN debug-location "));
+  ASSERT_TRUE(StringRef(str).ends_with("filename:1:5"));
 }
 
 TEST(MachineInstrSpan, DistanceBegin) {
diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp
index 8465c8b4f5394..63059d3267f71 100644
--- a/llvm/unittests/CodeGen/MachineOperandTest.cpp
+++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp
@@ -73,7 +73,7 @@ TEST(MachineOperandTest, PrintRegisterMask) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<regmask ...>");
+  ASSERT_TRUE(str == "<regmask ...>");
 }
 
 TEST(MachineOperandTest, PrintSubReg) {
@@ -94,7 +94,7 @@ TEST(MachineOperandTest, PrintSubReg) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "$physreg1.subreg5");
+  ASSERT_TRUE(str == "$physreg1.subreg5");
 }
 
 TEST(MachineOperandTest, PrintCImm) {
@@ -116,7 +116,7 @@ TEST(MachineOperandTest, PrintCImm) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "i128 18446744073709551616");
+  ASSERT_TRUE(str == "i128 18446744073709551616");
 }
 
 TEST(MachineOperandTest, PrintSubRegIndex) {
@@ -133,7 +133,7 @@ TEST(MachineOperandTest, PrintSubRegIndex) {
   std::string str;
   raw_string_ostream OS(str);
   MachineOperand::printSubRegIdx(OS, MO.getImm(), nullptr);
-  ASSERT_TRUE(OS.str() == "%subreg.3");
+  ASSERT_TRUE(str == "%subreg.3");
 }
 
 TEST(MachineOperandTest, PrintCPI) {
@@ -152,7 +152,7 @@ TEST(MachineOperandTest, PrintCPI) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "%const.0 + 8");
+    ASSERT_TRUE(str == "%const.0 + 8");
   }
 
   str.clear();
@@ -164,7 +164,7 @@ TEST(MachineOperandTest, PrintCPI) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "%const.0 - 12");
+    ASSERT_TRUE(str == "%const.0 - 12");
   }
 }
 
@@ -183,7 +183,7 @@ TEST(MachineOperandTest, PrintTargetIndexName) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "target-index(<unknown>) + 8");
+    ASSERT_TRUE(str == "target-index(<unknown>) + 8");
   }
 
   str.clear();
@@ -194,7 +194,7 @@ TEST(MachineOperandTest, PrintTargetIndexName) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "target-index(<unknown>) - 12");
+    ASSERT_TRUE(str == "target-index(<unknown>) - 12");
   }
 }
 
@@ -211,7 +211,7 @@ TEST(MachineOperandTest, PrintJumpTableIndex) {
   std::string str;
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "%jump-table.3");
+  ASSERT_TRUE(str == "%jump-table.3");
 }
 
 TEST(MachineOperandTest, PrintExternalSymbol) {
@@ -228,7 +228,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo");
+    ASSERT_TRUE(str == "&foo");
   }
 
   str.clear();
@@ -238,7 +238,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo + 12");
+    ASSERT_TRUE(str == "&foo + 12");
   }
 
   str.clear();
@@ -248,7 +248,7 @@ TEST(MachineOperandTest, PrintExternalSymbol) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "&foo - 12");
+    ASSERT_TRUE(str == "&foo - 12");
   }
 }
 
@@ -274,7 +274,7 @@ TEST(MachineOperandTest, PrintGlobalAddress) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "@foo + 12");
+    ASSERT_TRUE(str == "@foo + 12");
   }
 
   str.clear();
@@ -284,7 +284,7 @@ TEST(MachineOperandTest, PrintGlobalAddress) {
   {
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "@foo - 12");
+    ASSERT_TRUE(str == "@foo - 12");
   }
 }
 
@@ -302,7 +302,7 @@ TEST(MachineOperandTest, PrintRegisterLiveOut) {
   // Print a MachineOperand containing a register live out list without a TRI.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "liveout()");
+  ASSERT_TRUE(str == "liveout()");
 }
 
 TEST(MachineOperandTest, PrintMetadata) {
@@ -328,7 +328,7 @@ TEST(MachineOperandTest, PrintMetadata) {
   MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false,
            /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0,
            /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "!0");
+  ASSERT_TRUE(str == "!0");
 }
 
 TEST(MachineOperandTest, PrintMCSymbol) {
@@ -349,7 +349,7 @@ TEST(MachineOperandTest, PrintMCSymbol) {
   // Print a MachineOperand containing a metadata node.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<mcsymbol foo>");
+  ASSERT_TRUE(str == "<mcsymbol foo>");
 }
 
 TEST(MachineOperandTest, PrintCFI) {
@@ -366,7 +366,7 @@ TEST(MachineOperandTest, PrintCFI) {
   // attached to it.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "<call frame instruction>");
+  ASSERT_TRUE(str == "<call frame instruction>");
 }
 
 TEST(MachineOperandTest, PrintIntrinsicID) {
@@ -383,7 +383,7 @@ TEST(MachineOperandTest, PrintIntrinsicID) {
     // Print a MachineOperand containing a generic intrinsic ID.
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "intrinsic(@llvm.bswap)");
+    ASSERT_TRUE(str == "intrinsic(@llvm.bswap)");
   }
 
   str.clear();
@@ -394,7 +394,7 @@ TEST(MachineOperandTest, PrintIntrinsicID) {
     // IntrinsicInfo.
     raw_string_ostream OS(str);
     MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-    ASSERT_TRUE(OS.str() == "intrinsic(4294967295)");
+    ASSERT_TRUE(str == "intrinsic(4294967295)");
   }
 }
 
@@ -411,7 +411,7 @@ TEST(MachineOperandTest, PrintPredicate) {
   // Print a MachineOperand containing a int predicate ICMP_EQ.
   raw_string_ostream OS(str);
   MO.print(OS, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr);
-  ASSERT_TRUE(OS.str() == "intpred(eq)");
+  ASSERT_TRUE(str == "intpred(eq)");
 }
 
 TEST(MachineOperandTest, HashValue) {

From 69a21154caa5b53d302cd3bfd7ce0ec1a0c3d985 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Fri, 13 Sep 2024 15:13:58 +0100
Subject: [PATCH 03/43] [DAG] Fold
 trunc(srl(extract_elt(vec,c1),c2)) -> extract_elt(bitcast(vec),c3) (#107987)

Extends existing trunc(extract_elt(vec,c1)) -> extract_elt(bitcast(vec),c3)
fold.

Noticed while working on #107404
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 48 ++++++++++++-------
 llvm/test/CodeGen/AArch64/expand-select.ll    | 22 ++++-----
 2 files changed, 41 insertions(+), 29 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index bb907633e1f82..fe8ae5c9e9af6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15142,26 +15142,42 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   // Note: We only run this optimization after type legalization (which often
   // creates this pattern) and before operation legalization after which
   // we need to be more careful about the vector instructions that we generate.
-  if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
-      LegalTypes && !LegalOperations && N0->hasOneUse() && VT != MVT::i1) {
-    EVT VecTy = N0.getOperand(0).getValueType();
-    EVT ExTy = N0.getValueType();
+  if (LegalTypes && !LegalOperations && VT.isScalarInteger() && VT != MVT::i1 &&
+      N0->hasOneUse()) {
     EVT TrTy = N->getValueType(0);
+    SDValue Src = N0;
+
+    // Check for cases where we shift down an upper element before truncation.
+    int EltOffset = 0;
+    if (Src.getOpcode() == ISD::SRL && Src.getOperand(0)->hasOneUse()) {
+      if (auto ShAmt = DAG.getValidShiftAmount(Src)) {
+        if ((*ShAmt % TrTy.getSizeInBits()) == 0) {
+          Src = Src.getOperand(0);
+          EltOffset = *ShAmt / TrTy.getSizeInBits();
+        }
+      }
+    }
 
-    auto EltCnt = VecTy.getVectorElementCount();
-    unsigned SizeRatio = ExTy.getSizeInBits()/TrTy.getSizeInBits();
-    auto NewEltCnt = EltCnt * SizeRatio;
+    if (Src.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+      EVT VecTy = Src.getOperand(0).getValueType();
+      EVT ExTy = Src.getValueType();
 
-    EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt);
-    assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
+      auto EltCnt = VecTy.getVectorElementCount();
+      unsigned SizeRatio = ExTy.getSizeInBits() / TrTy.getSizeInBits();
+      auto NewEltCnt = EltCnt * SizeRatio;
 
-    SDValue EltNo = N0->getOperand(1);
-    if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
-      int Elt = EltNo->getAsZExtVal();
-      int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
-                         DAG.getBitcast(NVT, N0.getOperand(0)),
-                         DAG.getVectorIdxConstant(Index, DL));
+      EVT NVT = EVT::getVectorVT(*DAG.getContext(), TrTy, NewEltCnt);
+      assert(NVT.getSizeInBits() == VecTy.getSizeInBits() && "Invalid Size");
+
+      SDValue EltNo = Src->getOperand(1);
+      if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
+        int Elt = EltNo->getAsZExtVal();
+        int Index = isLE ? (Elt * SizeRatio + EltOffset)
+                         : (Elt * SizeRatio + (SizeRatio - 1) - EltOffset);
+        return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
+                           DAG.getBitcast(NVT, Src.getOperand(0)),
+                           DAG.getVectorIdxConstant(Index, DL));
+      }
     }
   }
 
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
index f8397290ab5e1..1ed2e09c6b4d4 100644
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -33,24 +33,20 @@ define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, ptr %Out) {
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    and w8, w0, #0x1
 ; CHECK-NEXT:    fmov s0, wzr
-; CHECK-NEXT:    ldr x11, [sp, #16]
+; CHECK-NEXT:    ldr x10, [sp, #16]
 ; CHECK-NEXT:    fmov s1, w8
-; CHECK-NEXT:    ldp x9, x10, [sp]
 ; CHECK-NEXT:    cmeq v0.4s, v1.4s, v0.4s
-; CHECK-NEXT:    dup v1.4s, v0.s[0]
-; CHECK-NEXT:    mov x8, v1.d[1]
-; CHECK-NEXT:    lsr x8, x8, #32
-; CHECK-NEXT:    tst w8, #0x1
 ; CHECK-NEXT:    fmov w8, s0
-; CHECK-NEXT:    csel x10, x5, x10, ne
-; CHECK-NEXT:    csel x9, x4, x9, ne
-; CHECK-NEXT:    stur x9, [x11, #12]
 ; CHECK-NEXT:    tst w8, #0x1
-; CHECK-NEXT:    str w10, [x11, #20]
-; CHECK-NEXT:    csel x8, x2, x6, ne
+; CHECK-NEXT:    ldp x9, x8, [sp]
+; CHECK-NEXT:    csel x11, x2, x6, ne
+; CHECK-NEXT:    str x11, [x10]
+; CHECK-NEXT:    csel x9, x4, x9, ne
+; CHECK-NEXT:    csel x8, x5, x8, ne
+; CHECK-NEXT:    stur x9, [x10, #12]
 ; CHECK-NEXT:    csel x9, x3, x7, ne
-; CHECK-NEXT:    str x8, [x11]
-; CHECK-NEXT:    str w9, [x11, #8]
+; CHECK-NEXT:    str w8, [x10, #20]
+; CHECK-NEXT:    str w9, [x10, #8]
 ; CHECK-NEXT:    ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0

From a3ea018d4900691e10ff6fd059cf07f33e949819 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 13 Sep 2024 07:22:49 -0700
Subject: [PATCH 04/43] [X86] Use MCRegister in X86AsmParser. (#108509)

---
 .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 112 +++++++++---------
 1 file changed, 57 insertions(+), 55 deletions(-)

diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 6b4e47a49eb17..735f9dcefb97f 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -434,7 +434,8 @@ class X86AsmParser : public MCTargetAsmParser {
 
   class IntelExprStateMachine {
     IntelExprState State = IES_INIT, PrevState = IES_ERROR;
-    unsigned BaseReg = 0, IndexReg = 0, TmpReg = 0, Scale = 0;
+    MCRegister BaseReg, IndexReg, TmpReg;
+    unsigned Scale = 0;
     int64_t Imm = 0;
     const MCExpr *Sym = nullptr;
     StringRef SymName;
@@ -468,8 +469,8 @@ class X86AsmParser : public MCTargetAsmParser {
     bool isBracketUsed() const { return BracketUsed; }
     bool isOffsetOperator() const { return OffsetOperator; }
     SMLoc getOffsetLoc() const { return OffsetOperatorLoc; }
-    unsigned getBaseReg() const { return BaseReg; }
-    unsigned getIndexReg() const { return IndexReg; }
+    MCRegister getBaseReg() const { return BaseReg; }
+    MCRegister getIndexReg() const { return IndexReg; }
     unsigned getScale() const { return Scale; }
     const MCExpr *getSym() const { return Sym; }
     StringRef getSymName() const { return SymName; }
@@ -791,7 +792,7 @@ class X86AsmParser : public MCTargetAsmParser {
       }
       PrevState = CurrState;
     }
-    bool onRegister(unsigned Reg, StringRef &ErrMsg) {
+    bool onRegister(MCRegister Reg, StringRef &ErrMsg) {
       IntelExprState CurrState = State;
       switch (State) {
       default:
@@ -1111,8 +1112,8 @@ class X86AsmParser : public MCTargetAsmParser {
   std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
   std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
-  bool IsSIReg(unsigned Reg);
-  unsigned GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, bool IsSIReg);
+  bool IsSIReg(MCRegister Reg);
+  MCRegister GetSIDIForRegClass(unsigned RegClassID, bool IsSIReg);
   void AddDefaultSrcDestOperands(OperandVector &Operands,
                                  std::unique_ptr<X86Operand> &&Src,
@@ -1145,14 +1146,14 @@ class X86AsmParser : public MCTargetAsmParser {
   void tryParseOperandIdx(AsmToken::TokenKind PrevTK,
                           IntelExprStateMachine &SM);
 
-  bool ParseMemOperand(unsigned SegReg, const MCExpr *Disp, SMLoc StartLoc,
+  bool ParseMemOperand(MCRegister SegReg, const MCExpr *Disp, SMLoc StartLoc,
                        SMLoc EndLoc, OperandVector &Operands);
 
   X86::CondCode ParseConditionCode(StringRef CCode);
 
   bool ParseIntelMemoryOperandSize(unsigned &Size);
-  bool CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp,
-                               unsigned BaseReg, unsigned IndexReg,
+  bool CreateMemForMSInlineAsm(MCRegister SegReg, const MCExpr *Disp,
+                               MCRegister BaseReg, MCRegister IndexReg,
                                unsigned Scale, bool NonAbsMem, SMLoc Start,
                                SMLoc End, unsigned Size, StringRef Identifier,
                                const InlineAsmIdentifierInfo &Info,
@@ -1300,14 +1301,15 @@ class X86AsmParser : public MCTargetAsmParser {
 #define GET_SUBTARGET_FEATURE_NAME
 #include "X86GenAsmMatcher.inc"
 
-static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
-                                            unsigned Scale, bool Is64BitMode,
+static bool CheckBaseRegAndIndexRegAndScale(MCRegister BaseReg,
+                                            MCRegister IndexReg, unsigned Scale,
+                                            bool Is64BitMode,
                                             StringRef &ErrMsg) {
   // If we have both a base register and an index register make sure they are
   // both 64-bit or 32-bit registers.
   // To support VSIB, IndexReg can be 128-bit or 256-bit registers.
-  if (BaseReg != 0 &&
+  if (BaseReg &&
       !(BaseReg == X86::RIP || BaseReg == X86::EIP ||
         X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg) ||
         X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg) ||
@@ -1316,7 +1318,7 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
     return true;
   }
 
-  if (IndexReg != 0 &&
+  if (IndexReg &&
       !(IndexReg == X86::EIZ || IndexReg == X86::RIZ ||
         X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
         X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
@@ -1328,9 +1330,9 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
     return true;
   }
 
-  if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg != 0) ||
-      IndexReg == X86::EIP || IndexReg == X86::RIP ||
-      IndexReg == X86::ESP || IndexReg == X86::RSP) {
+  if (((BaseReg == X86::RIP || BaseReg == X86::EIP) && IndexReg) ||
+      IndexReg == X86::EIP || IndexReg == X86::RIP || IndexReg == X86::ESP ||
+      IndexReg == X86::RSP) {
     ErrMsg = "invalid base+index expression";
     return true;
   }
@@ -1344,13 +1346,13 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
     return true;
   }
 
-  if (BaseReg == 0 &&
+  if (!BaseReg &&
      X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg)) {
    ErrMsg = "16-bit memory operand may not include only index register";
    return true;
   }
 
-  if (BaseReg != 0 && IndexReg != 0) {
+  if (BaseReg && IndexReg) {
     if (X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg) &&
         (X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg) ||
          X86MCRegisterClasses[X86::GR32RegClassID].contains(IndexReg) ||
@@ -1380,8 +1382,7 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
   }
 
   // RIP/EIP-relative addressing is only supported in 64-bit mode.
-  if (!Is64BitMode && BaseReg != 0 &&
-      (BaseReg == X86::RIP || BaseReg == X86::EIP)) {
+  if (!Is64BitMode && (BaseReg == X86::RIP || BaseReg == X86::EIP)) {
     ErrMsg = "IP-relative addressing requires 64-bit mode";
     return true;
   }
@@ -1608,7 +1609,8 @@ ParseStatus X86AsmParser::tryParseRegister(MCRegister &Reg, SMLoc &StartLoc,
 
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
   bool Parse32 = is32BitMode() || Code16GCC;
-  unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
+  MCRegister Basereg =
+      is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
   const MCExpr *Disp = MCConstantExpr::create(0, getContext());
   return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
                                /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1,
@@ -1617,15 +1619,16 @@ std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
 
 std::unique_ptr<X86Operand> X86AsmParser::DefaultMemDIOperand(SMLoc Loc) {
   bool Parse32 = is32BitMode() || Code16GCC;
-  unsigned Basereg = is64BitMode() ? X86::RDI : (Parse32 ?
X86::EDI : X86::DI); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); return X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, /*BaseReg=*/Basereg, /*IndexReg=*/0, /*Scale=*/1, Loc, Loc, 0); } -bool X86AsmParser::IsSIReg(unsigned Reg) { - switch (Reg) { +bool X86AsmParser::IsSIReg(MCRegister Reg) { + switch (Reg.id()) { default: llvm_unreachable("Only (R|E)SI and (R|E)DI are expected!"); case X86::RSI: case X86::ESI: @@ -1638,8 +1641,7 @@ bool X86AsmParser::IsSIReg(unsigned Reg) { } } -unsigned X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, unsigned Reg, - bool IsSIReg) { +MCRegister X86AsmParser::GetSIDIForRegClass(unsigned RegClassID, bool IsSIReg) { switch (RegClassID) { default: llvm_unreachable("Unexpected register class"); case X86::GR64RegClassID: @@ -1690,8 +1692,8 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, // Return false and let a normal complaint about bogus operands happen return false; - unsigned OrigReg = OrigOp.Mem.BaseReg; - unsigned FinalReg = FinalOp.Mem.BaseReg; + MCRegister OrigReg = OrigOp.Mem.BaseReg; + MCRegister FinalReg = FinalOp.Mem.BaseReg; // If we've already encounterd a register class, make sure all register // bases are of the same register class @@ -1713,7 +1715,7 @@ bool X86AsmParser::VerifyAndAdjustOperands(OperandVector &OrigOperands, return false; bool IsSI = IsSIReg(FinalReg); - FinalReg = GetSIDIForRegClass(RegClassID, FinalReg, IsSI); + FinalReg = GetSIDIForRegClass(RegClassID, IsSI); if (FinalReg != OrigReg) { std::string RegName = IsSI ? "ES:(R|E)SI" : "ES:(R|E)DI"; @@ -1753,13 +1755,11 @@ bool X86AsmParser::parseOperand(OperandVector &Operands, StringRef Name) { return parseATTOperand(Operands); } -bool X86AsmParser::CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, - unsigned BaseReg, unsigned IndexReg, - unsigned Scale, bool NonAbsMem, - SMLoc Start, SMLoc End, - unsigned Size, StringRef Identifier, - const InlineAsmIdentifierInfo &Info, - OperandVector &Operands) { +bool X86AsmParser::CreateMemForMSInlineAsm( + MCRegister SegReg, const MCExpr *Disp, MCRegister BaseReg, + MCRegister IndexReg, unsigned Scale, bool NonAbsMem, SMLoc Start, SMLoc End, + unsigned Size, StringRef Identifier, const InlineAsmIdentifierInfo &Info, + OperandVector &Operands) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) { @@ -2651,10 +2651,10 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { } StringRef ErrMsg; - unsigned BaseReg = SM.getBaseReg(); - unsigned IndexReg = SM.getIndexReg(); + MCRegister BaseReg = SM.getBaseReg(); + MCRegister IndexReg = SM.getIndexReg(); if (IndexReg && BaseReg == X86::RIP) - BaseReg = 0; + BaseReg = MCRegister(); unsigned Scale = SM.getScale(); if (!PtrInOperand) Size = SM.getElementSize() << 3; @@ -2703,7 +2703,7 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { // When parsing x64 MS-style assembly, all non-absolute references to a named // variable default to RIP-relative. 
- unsigned DefaultBaseReg = X86::NoRegister; + MCRegister DefaultBaseReg; bool MaybeDirectBranchDest = true; if (Parser.isParsingMasm()) { @@ -2738,7 +2738,7 @@ bool X86AsmParser::parseIntelOperand(OperandVector &Operands, StringRef Name) { MaybeDirectBranchDest = false; } - if ((BaseReg || IndexReg || RegNo || DefaultBaseReg != X86::NoRegister)) + if ((BaseReg || IndexReg || RegNo || DefaultBaseReg)) Operands.push_back(X86Operand::CreateMem( getPointerWidth(), RegNo, Disp, BaseReg, IndexReg, Scale, Start, End, Size, DefaultBaseReg, /*SymName=*/StringRef(), /*OpDecl=*/nullptr, @@ -2782,7 +2782,7 @@ bool X86AsmParser::parseATTOperand(OperandVector &Operands) { SMLoc Loc = Parser.getTok().getLoc(), EndLoc; const MCExpr *Expr = nullptr; - unsigned Reg = 0; + MCRegister Reg; if (getLexer().isNot(AsmToken::LParen)) { // No '(' so this is either a displacement expression or a register. if (Parser.parseExpression(Expr, EndLoc)) @@ -2954,7 +2954,7 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands) { /// ParseMemOperand: 'seg : disp(basereg, indexreg, scale)'. The '%ds:' prefix /// has already been parsed if present. disp may be provided as well. -bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, +bool X86AsmParser::ParseMemOperand(MCRegister SegReg, const MCExpr *Disp, SMLoc StartLoc, SMLoc EndLoc, OperandVector &Operands) { MCAsmParser &Parser = getParser(); @@ -3041,7 +3041,8 @@ bool X86AsmParser::ParseMemOperand(unsigned SegReg, const MCExpr *Disp, // If we reached here, then eat the '(' and Process // the rest of the memory operand. - unsigned BaseReg = 0, IndexReg = 0, Scale = 1; + MCRegister BaseReg, IndexReg; + unsigned Scale = 1; SMLoc BaseLoc = getLexer().getLoc(); const MCExpr *E; StringRef ErrMsg; @@ -3888,14 +3889,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { uint64_t TSFlags = MII.get(Opcode).TSFlags; if (isVFCMADDCPH(Opcode) || isVFCMADDCSH(Opcode) || isVFMADDCPH(Opcode) || isVFMADDCSH(Opcode)) { - unsigned Dest = Inst.getOperand(0).getReg(); + MCRegister Dest = Inst.getOperand(0).getReg(); for (unsigned i = 2; i < Inst.getNumOperands(); i++) if (Inst.getOperand(i).isReg() && Dest == Inst.getOperand(i).getReg()) return Warning(Ops[0]->getStartLoc(), "Destination register should be " "distinct from source registers"); } else if (isVFCMULCPH(Opcode) || isVFCMULCSH(Opcode) || isVFMULCPH(Opcode) || isVFMULCSH(Opcode)) { - unsigned Dest = Inst.getOperand(0).getReg(); + MCRegister Dest = Inst.getOperand(0).getReg(); // The mask variants have different operand list. Scan from the third // operand to avoid emitting incorrect warning. 
// VFMULCPHZrr Dest, Src1, Src2 @@ -3909,8 +3910,9 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { } else if (isV4FMADDPS(Opcode) || isV4FMADDSS(Opcode) || isV4FNMADDPS(Opcode) || isV4FNMADDSS(Opcode) || isVP4DPWSSDS(Opcode) || isVP4DPWSSD(Opcode)) { - unsigned Src2 = Inst.getOperand(Inst.getNumOperands() - - X86::AddrNumOperands - 1).getReg(); + MCRegister Src2 = + Inst.getOperand(Inst.getNumOperands() - X86::AddrNumOperands - 1) + .getReg(); unsigned Src2Enc = MRI->getEncodingValue(Src2); if (Src2Enc % 4 != 0) { StringRef RegName = X86IntelInstPrinter::getRegisterName(Src2); @@ -3946,9 +3948,9 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { } else if (isTCMMIMFP16PS(Opcode) || isTCMMRLFP16PS(Opcode) || isTDPBF16PS(Opcode) || isTDPFP16PS(Opcode) || isTDPBSSD(Opcode) || isTDPBSUD(Opcode) || isTDPBUSD(Opcode) || isTDPBUUD(Opcode)) { - unsigned SrcDest = Inst.getOperand(0).getReg(); - unsigned Src1 = Inst.getOperand(2).getReg(); - unsigned Src2 = Inst.getOperand(3).getReg(); + MCRegister SrcDest = Inst.getOperand(0).getReg(); + MCRegister Src1 = Inst.getOperand(2).getReg(); + MCRegister Src2 = Inst.getOperand(3).getReg(); if (SrcDest == Src1 || SrcDest == Src2 || Src1 == Src2) return Error(Ops[0]->getStartLoc(), "all tmm registers must be distinct"); } @@ -3956,14 +3958,14 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { // Check that we aren't mixing AH/BH/CH/DH with REX prefix. We only need to // check this with the legacy encoding, VEX/EVEX/XOP don't use REX. if ((TSFlags & X86II::EncodingMask) == 0) { - MCPhysReg HReg = X86::NoRegister; + MCRegister HReg; bool UsesRex = TSFlags & X86II::REX_W; unsigned NumOps = Inst.getNumOperands(); for (unsigned i = 0; i != NumOps; ++i) { const MCOperand &MO = Inst.getOperand(i); if (!MO.isReg()) continue; - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH) HReg = Reg; if (X86II::isX86_64NonExtLowByteReg(Reg) || @@ -3971,7 +3973,7 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { UsesRex = true; } - if (UsesRex && HReg != X86::NoRegister) { + if (UsesRex && HReg) { StringRef RegName = X86IntelInstPrinter::getRegisterName(HReg); return Error(Ops[0]->getStartLoc(), "can't encode '" + RegName + "' in an instruction requiring " @@ -4022,7 +4024,7 @@ void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) { case X86::RETI64: { MCInst ShlInst, FenceInst; bool Parse32 = is32BitMode() || Code16GCC; - unsigned Basereg = + MCRegister Basereg = is64BitMode() ? X86::RSP : (Parse32 ? X86::ESP : X86::SP); const MCExpr *Disp = MCConstantExpr::create(0, getContext()); auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp, From ee4582f9c8c395b1a9d901b522510af622206049 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 13 Sep 2024 07:23:44 -0700 Subject: [PATCH 05/43] [RISCV] Use CCValAssign::getCustomReg for fixed vector arguments/returns with RVV. (#108470) We need to insert a insert_subvector or extract_subvector which feels pretty custom. This should make it easier to support fixed vector arguments for GISel. 
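Condensed into one flow, the scheme the hunks below implement: the calling convention assigns the fixed-length vector to its scalable container register class and marks the assignment as custom, and lowering then bridges the two representations. A sketch assembled from the patch itself, not additional API:

    // RISCVCallingConv.cpp: travel in a scalable container (e.g. a v4i32
    // argument in an nxv2i32 register) and flag the assignment as custom.
    if (ValVT.isFixedLengthVector()) {
      LocVT = TLI.getContainerForFixedLengthVector(LocVT);
      State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
      return false;
    }

    // RISCVISelLowering.cpp: on the needsCustom() path, extract the fixed
    // vector back out of the container (convertToScalableVector is the
    // mirror image for outgoing values).
    if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector())
      return convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget);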
--- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 12 +++++++-- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 28 +++++++++------------ 2 files changed, 22 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index fc276d1063281..b7ed9de6ca84d 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -448,8 +448,12 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, if (Reg) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. - if (ValVT.isFixedLengthVector()) + if (ValVT.isFixedLengthVector()) { LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } } else { // For return values, the vector must be passed fully via registers or // via the stack. @@ -583,8 +587,12 @@ bool llvm::CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT, if (MCRegister Reg = allocateRVVReg(ValVT, ValNo, State, TLI)) { // Fixed-length vectors are located in the corresponding scalable-vector // container types. - if (LocVT.isFixedLengthVector()) + if (LocVT.isFixedLengthVector()) { LocVT = TLI.getContainerForFixedLengthVector(LocVT); + State.addLoc( + CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo)); + return false; + } State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6f2dc710cb3d4..ab49315c12d68 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -19090,20 +19090,18 @@ static SDValue convertLocVTToValVT(SelectionDAG &DAG, SDValue Val, if (VA.needsCustom()) { if (VA.getLocVT().isInteger() && (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); - else if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) - Val = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); - else - llvm_unreachable("Unexpected Custom handling."); - return Val; + return DAG.getNode(RISCVISD::FMV_H_X, DL, VA.getValVT(), Val); + if (VA.getLocVT() == MVT::i64 && VA.getValVT() == MVT::f32) + return DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, Val); + if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector()) + return convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); + llvm_unreachable("Unexpected Custom handling."); } switch (VA.getLocInfo()) { default: llvm_unreachable("Unexpected CCValAssign::LocInfo"); case CCValAssign::Full: - if (VA.getValVT().isFixedLengthVector() && VA.getLocVT().isScalableVector()) - Val = convertFromScalableVector(VA.getValVT(), Val, DAG, Subtarget); break; case CCValAssign::BCvt: Val = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), Val); @@ -19155,20 +19153,18 @@ static SDValue convertValVTToLocVT(SelectionDAG &DAG, SDValue Val, if (VA.needsCustom()) { if (LocVT.isInteger() && (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); - else if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) - Val = DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val); - else - llvm_unreachable("Unexpected Custom handling."); - return Val; + return DAG.getNode(RISCVISD::FMV_X_ANYEXTH, DL, LocVT, Val); + if (LocVT == MVT::i64 && VA.getValVT() == MVT::f32) + return 
DAG.getNode(RISCVISD::FMV_X_ANYEXTW_RV64, DL, MVT::i64, Val);
+    if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector())
+      return convertToScalableVector(LocVT, Val, DAG, Subtarget);
+    llvm_unreachable("Unexpected Custom handling.");
   }
 
   switch (VA.getLocInfo()) {
   default:
     llvm_unreachable("Unexpected CCValAssign::LocInfo");
   case CCValAssign::Full:
-    if (VA.getValVT().isFixedLengthVector() && LocVT.isScalableVector())
-      Val = convertToScalableVector(LocVT, Val, DAG, Subtarget);
     break;
   case CCValAssign::BCvt:
     Val = DAG.getNode(ISD::BITCAST, DL, LocVT, Val);

From 4a9b6b05c50a66f7dac6871f89a76daf77827c8d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 11 Sep 2024 17:22:28 +0100
Subject: [PATCH 06/43] [X86] Cleanup lowerShuffleToEXPAND arg layout. NFC.

Reorder the arg layout to match (most) other lowerShuffle* calls.

Rename to lowerShuffleWithEXPAND to match other lowering cases where we
lower to a single node.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 42 ++++++++++++-------------
 1 file changed, 21 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 3c5b952ff62e2..3597b864705ef 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -9927,11 +9927,11 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl);
 
 // X86 has dedicated shuffle that can be lowered to VEXPAND
-static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
-                                    const APInt &Zeroable,
-                                    ArrayRef<int> Mask, SDValue &V1,
-                                    SDValue &V2, SelectionDAG &DAG,
-                                    const X86Subtarget &Subtarget) {
+static SDValue lowerShuffleWithEXPAND(const SDLoc &DL, MVT VT, SDValue V1,
+                                      SDValue V2, ArrayRef<int> Mask,
+                                      const APInt &Zeroable,
+                                      const X86Subtarget &Subtarget,
+                                      SelectionDAG &DAG) {
   bool IsLeftZeroSide = true;
   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                 IsLeftZeroSide))
@@ -15966,8 +15966,8 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
-    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4f64, Zeroable, Mask, V1, V2,
-                                         DAG, Subtarget))
+    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4f64, V1, V2, Mask,
+                                           Zeroable, Subtarget, DAG))
       return V;
 
   // If we have AVX2 then we always want to lower with a blend because an v4 we
@@ -16046,8 +16046,8 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Zeroable, Subtarget, DAG))
       return Rotate;
 
-    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v4i64, Zeroable, Mask, V1, V2,
-                                         DAG, Subtarget))
+    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v4i64, V1, V2, Mask,
+                                           Zeroable, Subtarget, DAG))
       return V;
   }
 
@@ -16184,8 +16184,8 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // If we have VLX support, we can use VEXPAND.
   if (Subtarget.hasVLX())
-    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f32, Zeroable, Mask, V1, V2,
-                                         DAG, Subtarget))
+    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f32, V1, V2, Mask,
+                                           Zeroable, Subtarget, DAG))
       return V;
 
   // Try to match an interleave of two v8f32s and lower them as unpck and
@@ -16308,8 +16308,8 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                   Zeroable, Subtarget, DAG))
       return Rotate;
 
-    if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i32, Zeroable, Mask, V1, V2,
-                                         DAG, Subtarget))
+    if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i32, V1, V2, Mask,
+                                           Zeroable, Subtarget, DAG))
      return V;
   }
 
@@ -16827,8 +16827,8 @@ static SDValue lowerV8F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                           Zeroable, Subtarget, DAG))
       return Op;
 
-  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8f64, Zeroable, Mask, V1, V2,
-                                       DAG, Subtarget))
+  if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8f64, V1, V2, Mask, Zeroable,
+                                         Subtarget, DAG))
     return V;
 
   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8f64, V1, V2, Mask,
@@ -16898,8 +16898,8 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
   }
 
   // If we have AVX512F support, we can use VEXPAND.
-  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
-                                       V1, V2, DAG, Subtarget))
+  if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16f32, V1, V2, Mask,
+                                         Zeroable, Subtarget, DAG))
     return V;
 
   return lowerShuffleWithPERMV(DL, MVT::v16f32, Mask, V1, V2, Subtarget, DAG);
@@ -16967,8 +16967,8 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return Unpck;
 
   // If we have AVX512F support, we can use VEXPAND.
-  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v8i64, Zeroable, Mask, V1, V2,
-                                       DAG, Subtarget))
+  if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v8i64, V1, V2, Mask, Zeroable,
+                                         Subtarget, DAG))
     return V;
 
   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v8i64, V1, V2, Mask,
@@ -17064,8 +17064,8 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
     return V;
 
   // If we have AVX512F support, we can use VEXPAND.
-  if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
-                                       DAG, Subtarget))
+  if (SDValue V = lowerShuffleWithEXPAND(DL, MVT::v16i32, V1, V2, Mask,
+                                         Zeroable, Subtarget, DAG))
     return V;
 
   if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,

From 326287fd5b7b38987dbfbe80013225485d261790 Mon Sep 17 00:00:00 2001
From: Thomas Preud'homme
Date: Fri, 13 Sep 2024 15:48:17 +0100
Subject: [PATCH 07/43] Add missing FillOp to winograd lowering (#108181)

Winograd lowering involves a number of matmul and batch_matmul ops that
are currently passed a tensor.empty result as their out parameter, which
is undefined behaviour. This commit adds the necessary linalg.fill.
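The underlying issue: linalg.matmul and linalg.batch_matmul accumulate into their outs operand, so an outs tensor taken straight from tensor.empty has undefined contents. The fix applies one recurring builder pattern across the lowering; a condensed sketch of it follows, where shape, lhs and rhs stand in for the per-call-site values used in the hunks below:

    // Materialize a zero of the element type, then zero-fill the
    // destination tensor before handing it to the matmul as accumulator.
    Value zero = builder.create<arith::ConstantOp>(
        loc, builder.getZeroAttr(elementType));
    Value empty =
        builder.create<tensor::EmptyOp>(loc, shape, elementType).getResult();
    Value init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
    // The matmul now reads a well-defined accumulator:
    auto matmul = builder.create<linalg::MatmulOp>(
        loc, matmulType, ValueRange{lhs, rhs}, ValueRange{init});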
---------

Co-authored-by: Max191 <44243577+Max191@users.noreply.github.com>
---
 .../Linalg/Transforms/WinogradConv2D.cpp      |  57 ++++++--
 .../transform-tile-and-winograd-rewrite.mlir  | 137 ++++++++++++------
 .../Linalg/winograd-conv2d-rewrite.mlir       |  58 ++++----
 mlir/test/Dialect/Linalg/winograd-conv2d.mlir |  71 +++++----
 4 files changed, 213 insertions(+), 110 deletions(-)

diff --git a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
index b65b18699a15a..80edf4a32c6df 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/WinogradConv2D.cpp
@@ -390,6 +390,8 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter,
   TransformMapKeyTy key = {m, r};
   int64_t retRows = 1;
   Value matmulRetValue = extractFilter;
+  Value zero = builder.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(elementType));
   if (leftTransform) {
     // Get constant transform matrix G.
     auto it = GMatrices.find(key);
@@ -399,8 +401,11 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter,
     retRows = GMatrix.rows;
     auto matmulType = RankedTensorType::get({retRows, filterW}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
     Value G = create2DTransformMatrix(builder, loc, GMatrix, elementType);
     // Multiply G x g.
@@ -418,8 +423,11 @@ Value filterTransform(RewriterBase &rewriter, Location loc, Value filter,
 
     auto matmulType =
         RankedTensorType::get({retRows, GTMatrix.cols}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
     Value GT = create2DTransformMatrix(builder, loc, GTMatrix, elementType);
     // Multiply u = (G x g) x GT.
@@ -523,6 +531,8 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
   int64_t retRows = 1;
   int64_t retCols = 1;
   Value matmulRetValue = extractInput;
+  Value zero = builder.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(elementType));
   if (leftTransform) {
     // Get constant transform matrix BT.
     auto it = BTMatrices.find(key);
@@ -532,8 +542,11 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
     retRows = BTMatrix.rows;
     auto matmulType = RankedTensorType::get({retRows, alphaW}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
     Value BT =
         create2DTransformMatrix(builder, loc, BTMatrix, builder.getF32Type());
@@ -552,8 +565,11 @@ Value inputTransform(RewriterBase &rewriter, Location loc, Value input,
     retCols = BMatrix.cols;
     auto matmulType = RankedTensorType::get({retRows, retCols}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
     Value B =
         create2DTransformMatrix(builder, loc, BMatrix, builder.getF32Type());
     // Multiply v = (BT x d) x B.
@@ -636,8 +652,13 @@ static Value matrixMultiply(RewriterBase &rewriter, Location loc,
       {inputShape[0] * inputShape[1],
        inputShape[2] * inputShape[3] * inputShape[4], filterShape[3]},
       outputElementType);
-  Value init = rewriter.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                outputElementType);
+  Value empty = rewriter
+                    .create<tensor::EmptyOp>(loc, matmulType.getShape(),
+                                             outputElementType)
+                    .getResult();
+  Value zero = rewriter.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(outputElementType));
+  Value init = rewriter.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
   auto matmulOp = rewriter.create<linalg::BatchMatmulOp>(
       loc, matmulType, ValueRange({collapseInput, collapseFilter}),
@@ -725,6 +746,8 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
   int64_t leftScalarFactor = 1;
   int64_t rightScalarFactor = 1;
   Value matmulRetValue = extractValue;
+  Value zero = builder.create<arith::ConstantOp>(
+      loc, rewriter.getZeroAttr(elementType));
   if (leftTransform) {
     // Get constant transform matrix AT.
     auto it = ATMatrices.find(key);
@@ -735,8 +758,11 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
     leftScalarFactor = ATMatrix.scalarFactor;
     retRows = ATMatrix.rows;
     auto matmulType = RankedTensorType::get({retRows, valueW}, elementType);
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
     Value AT = create2DTransformMatrix(builder, loc, ATMatrix, elementType);
     // Multiply AT x m.
@@ -756,8 +782,11 @@ Value outputTransform(RewriterBase &rewriter, Location loc, Value value,
     auto matmulType =
         RankedTensorType::get({retRows, AMatrix.cols}, elementType);
     retCols = AMatrix.cols;
-    auto init = builder.create<tensor::EmptyOp>(loc, matmulType.getShape(),
-                                                elementType);
+    auto empty =
+        builder
+            .create<tensor::EmptyOp>(loc, matmulType.getShape(), elementType)
+            .getResult();
+    auto init = builder.create<linalg::FillOp>(loc, zero, empty).getResult(0);
 
     Value A = create2DTransformMatrix(builder, loc, AMatrix, elementType);
     // Multiply y = (AT x m) x A.
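The collapsed batch_matmul in matrixMultiply above gets the same zero-fill treatment as the per-tile matmuls, which is why the test updates that follow grow extra ops: each expanded CHECK sequence now pins down a tensor.empty, a linalg.fill of the zero constant, and the matmul reading that filled tensor, with the SSA value numbering shifted accordingly.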
diff --git a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir index 6bb3fb1423edc..c5760acf94a88 100644 --- a/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/transform-tile-and-winograd-rewrite.mlir @@ -36,6 +36,13 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<6x4xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_3:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_4:.*]] = arith.constant dense<{{.*}}> : tensor<3x6xf32> +// CHECK: %[[CST_5:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> +// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index @@ -44,9 +51,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] -// CHECK: %[[S11:.*]] = linalg.matmul -// CHECK: %[[S13:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S10:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK: %[[S11:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S10]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S12:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S11]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S13:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S14:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.matmul ins(%[[S12]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S9]] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> @@ -60,9 +71,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S13:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S17]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S14:.*]] = 
tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_8]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S17:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S19:.*]] = linalg.matmul ins(%[[S16]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S13]] // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][0, 0, %[[ARG3]], %[[ARG5]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] @@ -82,15 +97,19 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[EXTRACTED_SLICE_7]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) // CHECK: %[[EXTRACTED_SLICE_8:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE]][0, 0, 0, 0, %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul -// CHECK: %[[S20:.*]] = tensor.empty() -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: %[[S16:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S18:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_8]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S19:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S20:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S19]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S21:.*]] = linalg.matmul ins(%[[S18]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S23:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S22]] : tensor<4x4xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S22]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[S24:.*]] = linalg.mul ins(%[[S23]], %[[S21]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S22]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_9:.*]] = tensor.insert_slice %[[S24]] into %[[ARG10]][%[[ARG7]], 0, 0, %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_9]] // CHECK: scf.yield %[[S15]] // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) @@ -114,14 +133,15 @@ func.func @conv2d_unaligned(%arg0: 
tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %4 = tensor.empty() : tensor<36x18x2xf32> - %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> - %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%5 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg4: index, %arg5: index, %arg6: index, %arg7: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> - %extracted_slice = tensor.extract_slice %6[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %7[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } @@ -147,18 +167,29 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d_unaligned // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x11x11x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<2x9x9x2xf32>) -> tensor<2x9x9x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 1.024000e+03 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<6x4xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_3:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> // CHECK: %[[C3:.*]] = arith.constant 3 : index +// CHECK: %[[CST_4:.*]] = arith.constant dense<{{.*}}> : tensor<3x6xf32> +// CHECK: %[[CST_5:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[CST_6:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() // CHECK: %[[S1:.*]] = scf.for %[[ARG4:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG5:.*]] = %[[S0]]) // CHECK: %[[S9:.*]] = scf.for %[[ARG6:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG7:.*]] = %[[ARG5]]) // CHECK: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG4]], 0, 0, %[[ARG6]]] [1, 3, 3, 1] [1, 1, 1, 1] -// CHECK: %[[S11:.*]] = linalg.matmul -// CHECK: %[[S13:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S13]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S11:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK: 
%[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S13:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S12]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK: %[[S14:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S16]] into %[[ARG7]][0, 0, %[[ARG6]], %[[ARG4]]] [6, 6, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> // CHECK: scf.yield %[[S9]] : tensor<6x6x5x2xf32> // CHECK: %[[PADDED:.*]] = tensor.pad %[[ARG0]] low[0, 0, 0, 0] high[0, 3, 3, 0] @@ -173,9 +204,13 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: %[[S13:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 6, 6, 1] [1, 1, 1, 1] -// CHECK: %[[S15:.*]] = linalg.matmul -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S17]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S15:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S16:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S15]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S17:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_11]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S16]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S18:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK: %[[S19:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S18]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[S20:.*]] = linalg.matmul ins(%[[S17]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S19]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S20]] into %[[ARG11]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] : tensor<6x6x1x1x2x5xf32> // CHECK: scf.yield %[[S13]] : tensor<6x6x1x1x2x5xf32> // CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG7]][0, 0, %[[ARG4]], %[[ARG6]], 0, 0] [6, 6, 1, 1, 2, 5] [1, 1, 1, 1, 1, 1] @@ -196,15 +231,19 @@ module attributes {transform.with_named_sequence} { // CHECK: %[[S12:.*]] = scf.for %[[ARG8:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG9:.*]] = %[[EXTRACTED_SLICE_10]]) // CHECK: %[[S15:.*]] = scf.for %[[ARG10:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG11:.*]] = %[[ARG9]]) // CHECK: %[[EXTRACTED_SLICE_11:.*]] = tensor.extract_slice %[[EXTRACTED_SLICE_9]][0, 0, 0, 0, %[[ARG8]], %[[ARG10]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S17:.*]] = linalg.matmul -// CHECK: %[[S19:.*]] = linalg.matmul +// CHECK: %[[S17:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK: %[[S18:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S17]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK: %[[S19:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_11]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S18]] : 
tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK: %[[S20:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK: %[[S21:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S20]] : tensor<4x4xf32>) { +// CHECK: %[[S21:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S22:.*]] = linalg.matmul ins(%[[S19]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S21]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[S23:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK: %[[S24:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S23]] : tensor<4x4xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x4xf32> -// CHECK: %[[S22:.*]] = linalg.mul ins(%[[S21]], %[[S19]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S20]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S22]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] +// CHECK: %[[S25:.*]] = linalg.mul ins(%[[S24]], %[[S22]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S23]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK: %[[INSERTED_SLICE_12:.*]] = tensor.insert_slice %[[S25]] into %[[ARG11]][%[[ARG8]], 0, 0, %[[ARG10]]] [1, 4, 4, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE_12]] // CHECK: scf.yield %[[S15]] : tensor<2x4x4x2xf32> // CHECK: %[[S13:.*]] = affine.apply #[[$MAP0]](%[[ARG4]]) @@ -218,6 +257,7 @@ module attributes {transform.with_named_sequence} { // ----- func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32>, %arg2: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { + %cst = arith.constant 0.000000e+00 : f32 %0 = tensor.empty() : tensor<6x1x5x2xf32> %1 = linalg.winograd_filter_transform m(4) r(3) ins(%arg1 : tensor<2x3x1x5xf32>) outs(%0 : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> %2 = tensor.empty() : tensor<6x1x1x1x2x5xf32> @@ -225,10 +265,11 @@ func.func @conv2d_mx1_rx1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> %collapsed = tensor.collapse_shape %1 [[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> %collapsed_0 = tensor.collapse_shape %3 [[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> %4 = tensor.empty() : tensor<6x2x2xf32> - %5 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> - %expanded = tensor.expand_shape %5 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> - %6 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> - return %6 : tensor<2x4x1x2xf32> + %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> + %6 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%5 : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> + %expanded = tensor.expand_shape %6 [[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> + %7 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x1x1x1x2x2xf32>) outs(%arg2 : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> + return %7 : tensor<2x4x1x2xf32> } module attributes {transform.with_named_sequence} { @@ -252,41 
+293,53 @@ module attributes {transform.with_named_sequence} { // CHECK-LABEL: func.func @conv2d_mx1_rx1 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { // CHECK: %[[CST:.*]] = arith.constant 3.200000e+01 : f32 +// CHECK: %[[CST_0:.*]] = arith.constant dense<{{.*}}> : tensor<4x6xf32> +// CHECK: %[[CST_1:.*]] = arith.constant dense<{{.*}}> : tensor<6x6xf32> +// CHECK: %[[CST_2:.*]] = arith.constant dense<{{.*}}> : tensor<6x3xf32> // CHECK: %[[C1:.*]] = arith.constant 1 : index // CHECK: %[[C5:.*]] = arith.constant 5 : index // CHECK: %[[C2:.*]] = arith.constant 2 : index // CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[CST_3:.*]] = arith.constant 0.000000e+00 : f32 // CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 3, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S8:.*]] = tensor.empty() : tensor<6x1xf32> +// CHECK: %[[S9:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S8]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[S10:.*]] = linalg.matmul ins(%[[CST_2]], %[[EXTRACTED_SLICE]] : tensor<6x3xf32>, tensor<3x1xf32>) outs(%[[S9]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S10]] into %[[ARG6]][0, 0, %[[ARG5]], %[[ARG3]]] [6, 1, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: %[[S2:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> // CHECK: %[[S3:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[ARG0]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 6, 1, 1] [1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S9]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] +// CHECK: %[[S8:.*]] = tensor.empty() : tensor<6x1xf32> +// CHECK: %[[S9:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S8]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[S10:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE]] : tensor<6x6xf32>, tensor<6x1xf32>) outs(%[[S9]] : tensor<6x1xf32>) -> tensor<6x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S10]] into %[[ARG6]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] // CHECK: %[[COLLAPSED_3:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] -// CHECK: %[[S5:.*]] = linalg.batch_matmul -// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] +// CHECK: %[[S4:.*]] = tensor.empty() : tensor<6x2x2xf32> +// CHECK: %[[S5:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S4]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> 
+// CHECK: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_3]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S5]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] // CHECK: %[[S6:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[ARG2]]) // CHECK: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) // CHECK: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, 0, 0, %[[ARG3]], %[[ARG5]]] [6, 1, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] -// CHECK: %[[S9:.*]] = linalg.matmul -// CHECK: %[[S10:.*]] = tensor.empty() : tensor<4x1xf32> -// CHECK: %[[S11:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S10]] : tensor<4x1xf32>) { +// CHECK: %[[S9:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S10:.*]] = linalg.fill ins(%[[CST_3]] : f32) outs(%[[S9]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[S11:.*]] = linalg.matmul ins(%[[CST_0]], %[[EXTRACTED_SLICE]] : tensor<4x6xf32>, tensor<6x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[S12:.*]] = tensor.empty() : tensor<4x1xf32> +// CHECK: %[[S13:.*]] = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S12]] : tensor<4x1xf32>) { // CHECK: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK: linalg.yield %[[IN]] : f32 // CHECK: } -> tensor<4x1xf32> -// CHECK: %[[S12:.*]] = linalg.mul ins(%[[S11]], %[[S9]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S10]] : tensor<4x1xf32>) -> tensor<4x1xf32> -// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S12]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] +// CHECK: %[[S14:.*]] = linalg.mul ins(%[[S13]], %[[S11]] : tensor<4x1xf32>, tensor<4x1xf32>) outs(%[[S12]] : tensor<4x1xf32>) -> tensor<4x1xf32> +// CHECK: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[ARG3]], 0, 0, %[[ARG5]]] [1, 4, 1, 1] [1, 1, 1, 1] // CHECK: scf.yield %[[INSERTED_SLICE]] // CHECK: scf.yield %[[S7]] // CHECK: return %[[S6]] diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir index 095a6636b68dc..4369f5f1eab4c 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d-rewrite.mlir @@ -13,14 +13,15 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg %collapsed = tensor.collapse_shape %3 [[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> %collapsed_0 = tensor.collapse_shape %5 [[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> %6 = tensor.empty() : tensor<36x18x2xf32> - %7 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%6 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> - %expanded = tensor.expand_shape %7 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> + %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %8 = linalg.batch_matmul ins(%collapsed_0, %collapsed : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%7 : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> + %expanded = tensor.expand_shape %8 [[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : 
tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> %padded_1 = tensor.pad %arg2 low[0, 0, 0, 0] high[0, 3, 3, 0] { ^bb0(%arg3: index, %arg4: index, %arg5: index, %arg6: index): tensor.yield %cst : f32 } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> - %8 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> - %extracted_slice = tensor.extract_slice %8[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> + %9 = linalg.winograd_output_transform m(4) r(3) ins(%expanded : tensor<6x6x3x3x2x2xf32>) outs(%padded_1 : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> + %extracted_slice = tensor.extract_slice %9[0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> return %extracted_slice : tensor<2x9x9x2xf32> } @@ -46,11 +47,13 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S1:.*]] = scf.for %[[ARG3:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG4:.*]] = %[[S0]]) -> (tensor<6x6x5x2xf32>) { // CHECK-NEXT: %[[S7:.*]] = scf.for %[[ARG5:.*]] = %[[C0]] to %[[C5]] step %[[C1]] iter_args(%[[ARG6:.*]] = %[[ARG4]]) -> (tensor<6x6x5x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[ARG1]][%[[ARG3]], %[[C0]], %[[C0]], %[[ARG5]]] [1, 3, 3, 1] [1, 1, 1, 1] : tensor<2x3x3x5xf32> to tensor<3x3xf32> -// CHECK-NEXT: %[[S8:.*]] = tensor.empty() : tensor<6x3xf32> -// CHECK-NEXT: %[[S9:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S8]] : tensor<6x3xf32>) -> tensor<6x3xf32> -// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<6x6xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[S9]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S10]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S11]] into %[[ARG6]][%[[C0]], %[[C0]], %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x5x2xf32> +// CHECK-NEXT: %[[S9:.*]] = tensor.empty() : tensor<6x3xf32> +// CHECK-NEXT: %[[S10:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S9]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[CST_5]], %[[EXTRACTED_SLICE_9]] : tensor<6x3xf32>, tensor<3x3xf32>) outs(%[[S10]] : tensor<6x3xf32>) -> tensor<6x3xf32> +// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S12]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S14:.*]] = linalg.matmul ins(%[[S11]], %[[CST_4]] : tensor<6x3xf32>, tensor<3x6xf32>) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S14]] into %[[ARG6]][%[[C0]], %[[C0]], %[[ARG5]], %[[ARG3]]] [6, 6, 1, 1] [1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x5x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x5x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S7]] : tensor<6x6x5x2xf32> @@ -67,11 +70,13 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S10:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) // CHECK-NEXT: %[[S11:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[PADDED]][%[[ARG7]], %[[S10]], %[[S11]], %[[ARG9]]] [1, 6, 6, 1] [1, 1, 1, 1] : tensor<2x14x14x5xf32> to tensor<6x6xf32> -// CHECK-NEXT: %[[S12:.*]] = 
tensor.empty() : tensor<6x6xf32> -// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_9]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<6x6xf32> -// CHECK-NEXT: %[[S15:.*]] = linalg.matmul ins(%[[S13]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S15]] into %[[ARG10]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x3x3x2x5xf32> +// CHECK-NEXT: %[[S13:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S14:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S13]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S15:.*]] = linalg.matmul ins(%[[CST_3]], %[[EXTRACTED_SLICE_9]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S14]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S16:.*]] = tensor.empty() : tensor<6x6xf32> +// CHECK-NEXT: %[[S17:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S16]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[S18:.*]] = linalg.matmul ins(%[[S15]], %[[CST_2]] : tensor<6x6xf32>, tensor<6x6xf32>) outs(%[[S17]] : tensor<6x6xf32>) -> tensor<6x6xf32> +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S18]] into %[[ARG10]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6xf32> into tensor<6x6x3x3x2x5xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<6x6x3x3x2x5xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : tensor<6x6x3x3x2x5xf32> @@ -83,8 +88,9 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_7:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_7]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_7]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S5]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> // CHECK-NEXT: %[[PADDED_8:.*]] = tensor.pad %[[ARG2]] low[0, 0, 0, 0] high[0, 3, 3, 0] { // CHECK-NEXT: ^bb0(%[[ARG3:.*]]: index, %[[ARG4:.*]]: index, %[[ARG5:.*]]: index, %[[ARG6:.*]]: index): // CHECK-NEXT: tensor.yield %[[CST_6]] : f32 @@ -94,19 +100,21 @@ func.func @conv2d(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5xf32>, %arg // CHECK-NEXT: %[[S8:.*]] = scf.for %[[ARG7:.*]] = %[[C0]] to %[[C2]] step %[[C1]] iter_args(%[[ARG8:.*]] = %[[ARG6]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[S9:.*]] = scf.for %[[ARG9:.*]] = %[[C0]] to 
%[[C2]] step %[[C1]] iter_args(%[[ARG10:.*]] = %[[ARG8]]) -> (tensor<2x12x12x2xf32>) { // CHECK-NEXT: %[[EXTRACTED_SLICE_9:.*]] = tensor.extract_slice %[[EXPANDED]][0, 0, %[[ARG3]], %[[ARG5]], %[[ARG7]], %[[ARG9]]] [6, 6, 1, 1, 1, 1] [1, 1, 1, 1, 1, 1] : tensor<6x6x3x3x2x2xf32> to tensor<6x6xf32> -// CHECK-NEXT: %[[S10:.*]] = tensor.empty() : tensor<4x6xf32> -// CHECK-NEXT: %[[S11:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S10]] : tensor<4x6xf32>) -> tensor<4x6xf32> -// CHECK-NEXT: %[[S12:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[S11]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S12]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S11:.*]] = tensor.empty() : tensor<4x6xf32> +// CHECK-NEXT: %[[S12:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S11]] : tensor<4x6xf32>) -> tensor<4x6xf32> +// CHECK-NEXT: %[[S13:.*]] = linalg.matmul ins(%[[CST_1]], %[[EXTRACTED_SLICE_9]] : tensor<4x6xf32>, tensor<6x6xf32>) outs(%[[S12]] : tensor<4x6xf32>) -> tensor<4x6xf32> // CHECK-NEXT: %[[S14:.*]] = tensor.empty() : tensor<4x4xf32> -// CHECK-NEXT: %[[S15:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S14]] : tensor<4x4xf32>) { +// CHECK-NEXT: %[[S15:.*]] = linalg.fill ins(%[[CST_6]] : f32) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S16:.*]] = linalg.matmul ins(%[[S13]], %[[CST_0]] : tensor<4x6xf32>, tensor<6x4xf32>) outs(%[[S15]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S17:.*]] = tensor.empty() : tensor<4x4xf32> +// CHECK-NEXT: %[[S18:.*]] = linalg.generic {indexing_maps = [#[[$MAP1]], #[[$MAP2]]], iterator_types = ["parallel", "parallel"]} ins(%[[CST]] : f32) outs(%[[S17]] : tensor<4x4xf32>) { // CHECK-NEXT: ^bb0(%[[IN:.*]]: f32, %[[OUT:.*]]: f32): // CHECK-NEXT: linalg.yield %[[IN]] : f32 // CHECK-NEXT: } -> tensor<4x4xf32> -// CHECK-NEXT: %[[S16:.*]] = linalg.mul ins(%[[S15]], %[[S13]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S14]] : tensor<4x4xf32>) -> tensor<4x4xf32> -// CHECK-NEXT: %[[S17:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) -// CHECK-NEXT: %[[S18:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) -// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S16]] into %[[ARG10]][%[[ARG7]], %[[S17]], %[[S18]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[S19:.*]] = linalg.mul ins(%[[S18]], %[[S16]] : tensor<4x4xf32>, tensor<4x4xf32>) outs(%[[S17]] : tensor<4x4xf32>) -> tensor<4x4xf32> +// CHECK-NEXT: %[[S20:.*]] = affine.apply #[[$MAP0]](%[[ARG3]]) +// CHECK-NEXT: %[[S21:.*]] = affine.apply #[[$MAP0]](%[[ARG5]]) +// CHECK-NEXT: %[[INSERTED_SLICE:.*]] = tensor.insert_slice %[[S19]] into %[[ARG10]][%[[ARG7]], %[[S20]], %[[S21]], %[[ARG9]]] [1, 4, 4, 1] [1, 1, 1, 1] : tensor<4x4xf32> into tensor<2x12x12x2xf32> // CHECK-NEXT: scf.yield %[[INSERTED_SLICE]] : tensor<2x12x12x2xf32> // CHECK-NEXT: } // CHECK-NEXT: scf.yield %[[S9]] : tensor<2x12x12x2xf32> diff --git a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir index ec11a6ef8fbee..0040d81a2d24e 100644 --- a/mlir/test/Dialect/Linalg/winograd-conv2d.mlir +++ b/mlir/test/Dialect/Linalg/winograd-conv2d.mlir @@ -7,6 +7,7 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32> // CHECK-LABEL: func.func @conv2d_4x4_3x3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, 
%[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> @@ -14,10 +15,11 @@ func.func @conv2d_4x4_3x3(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x3x3x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x4x4x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x4x4x2xf32> // CHECK-NEXT: } // ----- @@ -29,6 +31,7 @@ func.func @conv2d_2x2_5x5(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x5x5x5xf32> // CHECK-LABEL: func.func @conv2d_2x2_5x5 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf32>, %[[ARG1:.*]]: tensor<2x5x5x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(2) r(5) ins(%[[ARG1]] : tensor<2x5x5x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf32> @@ -36,10 +39,11 @@ func.func @conv2d_2x2_5x5(%arg0: tensor<2x6x6x5xf32>, %arg1: tensor<2x5x5x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf32> into tensor<36x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : tensor<36x2x2xf32>) 
-> tensor<36x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(2) r(5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x2x2x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(2) r(5) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x2x2x2xf32>) -> tensor<2x2x2x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x2x2x2xf32> // CHECK-NEXT: } // ----- @@ -51,6 +55,7 @@ func.func @conv2d_1x4_1x3(%arg0: tensor<2x1x6x5xf32>, %arg1: tensor<2x1x3x5xf32> // CHECK-LABEL: func.func @conv2d_1x4_1x3 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x1x6x5xf32>, %[[ARG1:.*]]: tensor<2x1x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<1x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x1x3x5xf32>) outs(%[[S2]] : tensor<1x6x5x2xf32>) -> tensor<1x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<1x6x1x1x2x5xf32> @@ -58,10 +63,11 @@ func.func @conv2d_1x4_1x3(%arg0: tensor<2x1x6x5xf32>, %arg1: tensor<2x1x3x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<1x6x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<1x6x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [1, 6, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<1x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x1x4x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [1, 6, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<1x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<1x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x1x4x2xf32>) -> tensor<2x1x4x2xf32> +// CHECK-NEXT: return %[[S9]] 
: tensor<2x1x4x2xf32> // CHECK-NEXT: } // ----- @@ -73,6 +79,7 @@ func.func @conv2d_4x1_3x1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> // CHECK-LABEL: func.func @conv2d_4x1_3x1 // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x1x5xf32>, %[[ARG1:.*]]: tensor<2x3x1x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x1x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x1x5xf32>) outs(%[[S2]] : tensor<6x1x5x2xf32>) -> tensor<6x1x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x1x1x1x2x5xf32> @@ -80,10 +87,11 @@ func.func @conv2d_4x1_3x1(%arg0: tensor<2x6x1x5xf32>, %arg1: tensor<2x3x1x5xf32> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x1x5x2xf32> into tensor<6x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x1x1x1x2x5xf32> into tensor<6x2x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<6x2x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x4x1x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<6x2x5xf32>, tensor<6x5x2xf32>) outs(%[[S7]] : tensor<6x2x2xf32>) -> tensor<6x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 1, 1, 1, 2, 2] : tensor<6x2x2xf32> into tensor<6x1x1x1x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x1x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x1x2xf32>) -> tensor<2x4x1x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x4x1x2xf32> // CHECK-NEXT: } // ----- @@ -95,6 +103,7 @@ func.func @conv2d_aligned(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf3 // CHECK-LABEL: func.func @conv2d_aligned // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x10x10x5xf32>, %[[ARG1:.*]]: tensor<2x3x3x5xf32>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> { +// CHECK-NEXT: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf32>) outs(%[[S2]] : tensor<6x6x5x2xf32>) -> tensor<6x6x5x2xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<6x6x2x2x2x5xf32> @@ -102,10 +111,11 @@ func.func @conv2d_aligned(%arg0: tensor<2x10x10x5xf32>, %arg1: tensor<2x3x3x5xf3 // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x2x2x2x5xf32> into 
tensor<36x8x5xf32> // CHECK-NEXT: %[[S6:.*]] = tensor.empty() : tensor<36x8x2xf32> -// CHECK-NEXT: %[[S7:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%[[S6]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S7]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> -// CHECK-NEXT: %[[S8:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> -// CHECK-NEXT: return %[[S8]] : tensor<2x8x8x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S6]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> +// CHECK-NEXT: %[[S8:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x8x5xf32>, tensor<36x5x2xf32>) outs(%[[S7]] : tensor<36x8x2xf32>) -> tensor<36x8x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S8]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 2, 2, 2, 2] : tensor<36x8x2xf32> into tensor<6x6x2x2x2x2xf32> +// CHECK-NEXT: %[[S9:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x2x2x2x2xf32>) outs(%[[ARG3]] : tensor<2x8x8x2xf32>) -> tensor<2x8x8x2xf32> +// CHECK-NEXT: return %[[S9]] : tensor<2x8x8x2xf32> // CHECK-NEXT: } // ----- @@ -129,14 +139,15 @@ func.func @conv2d_unaligned(%arg0: tensor<2x11x11x5xf32>, %arg1: tensor<2x3x3x5x // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf32> into tensor<36x5x2xf32> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %3 {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x3x3x2x5xf32> into tensor<36x18x5xf32> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x18x2xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S4]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x18x5xf32>, tensor<36x5x2xf32>) outs(%[[S5]] : tensor<36x18x2xf32>) -> tensor<36x18x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 3, 3, 2, 2] : tensor<36x18x2xf32> into tensor<6x6x3x3x2x2xf32> // CHECK-NEXT: %[[PADDED_1:.*]] = tensor.pad %arg3 low[0, 0, 0, 0] high[0, 3, 3, 0] { // CHECK-NEXT: ^bb0 // CHECK-NEXT: tensor.yield %[[CST]] : f32 // CHECK-NEXT: } : tensor<2x9x9x2xf32> to tensor<2x12x12x2xf32> -// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> -// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S6]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x3x3x2x2xf32>) outs(%[[PADDED_1]] : tensor<2x12x12x2xf32>) -> tensor<2x12x12x2xf32> +// CHECK-NEXT: %[[EXTRACTED_SLICE:.*]] = tensor.extract_slice %[[S7]][0, 0, 0, 0] [2, 9, 9, 2] [1, 1, 1, 1] : 
tensor<2x12x12x2xf32> to tensor<2x9x9x2xf32> // CHECK-NEXT: return %[[EXTRACTED_SLICE]] : tensor<2x9x9x2xf32> // CHECK-NEXT: } @@ -149,17 +160,19 @@ func.func @conv2d_type_promotion(%arg0: tensor<2x6x6x5xf16>, %arg1: tensor<2x3x3 // CHECK-LABEL: func.func @conv2d_type_promotion // CHECK-SAME: (%[[ARG0:.*]]: tensor<2x6x6x5xf16>, %[[ARG1:.*]]: tensor<2x3x3x5xf16>, %[[ARG2:.*]]: tensor<1xf32>, %[[ARG3:.*]]: tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> { -// CHECK: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf16> +// CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +// CHECK-NEXT: %[[S0:.*]] = tensor.empty() : tensor<6x6x5x2xf16> // CHECK-NEXT: %[[S1:.*]] = linalg.winograd_filter_transform m(4) r(3) ins(%[[ARG1]] : tensor<2x3x3x5xf16>) outs(%[[S0]] : tensor<6x6x5x2xf16>) -> tensor<6x6x5x2xf16> // CHECK-NEXT: %[[S2:.*]] = tensor.empty() : tensor<6x6x1x1x2x5xf16> // CHECK-NEXT: %[[S3:.*]] = linalg.winograd_input_transform m(4) r(3) ins(%[[ARG0]] : tensor<2x6x6x5xf16>) outs(%[[S2]] : tensor<6x6x1x1x2x5xf16>) -> tensor<6x6x1x1x2x5xf16> // CHECK-NEXT: %[[COLLAPSED:.*]] = tensor.collapse_shape %[[S1]] {{\[}}[0, 1], [2], [3]] : tensor<6x6x5x2xf16> into tensor<36x5x2xf16> // CHECK-NEXT: %[[COLLAPSED_0:.*]] = tensor.collapse_shape %[[S3]] {{\[}}[0, 1], [2, 3, 4], [5]] : tensor<6x6x1x1x2x5xf16> into tensor<36x2x5xf16> // CHECK-NEXT: %[[S4:.*]] = tensor.empty() : tensor<36x2x2xf32> -// CHECK-NEXT: %[[S5:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[S4]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> -// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S5]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> -// CHECK-NEXT: %[[S6:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> -// CHECK-NEXT: return %[[S6]] : tensor<2x4x4x2xf32> +// CHECK-NEXT: %[[S5:.*]] = linalg.fill ins(%[[CST]] : f32) outs(%[[S4]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[S6:.*]] = linalg.batch_matmul ins(%[[COLLAPSED_0]], %[[COLLAPSED]] : tensor<36x2x5xf16>, tensor<36x5x2xf16>) outs(%[[S5]] : tensor<36x2x2xf32>) -> tensor<36x2x2xf32> +// CHECK-NEXT: %[[EXPANDED:.*]] = tensor.expand_shape %[[S6]] {{\[}}[0, 1], [2, 3, 4], [5]] output_shape [6, 6, 1, 1, 2, 2] : tensor<36x2x2xf32> into tensor<6x6x1x1x2x2xf32> +// CHECK-NEXT: %[[S7:.*]] = linalg.winograd_output_transform m(4) r(3) ins(%[[EXPANDED]] : tensor<6x6x1x1x2x2xf32>) outs(%[[ARG3]] : tensor<2x4x4x2xf32>) -> tensor<2x4x4x2xf32> +// CHECK-NEXT: return %[[S7]] : tensor<2x4x4x2xf32> // CHECK-NEXT: } // ----- From 387ef59ab9520c0811bf555679dd8ff0a60417c6 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 13 Sep 2024 07:50:45 -0700 Subject: [PATCH 08/43] [clang][TableGen] Change TypeNodesEmitter to use const RecordKeeper (#108476) Change TypeNodesEmitter to use const RecordKeeper. 
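The conversion follows one mechanical pattern, repeated across the backends in
this series: take the RecordKeeper by const reference and bind the result of
getAllDerivedDefinitions() to ArrayRef<const Record *>. As a rough sketch of
the target shape (a hypothetical backend; EmitExampleNodes and the TableGen
class name "ExampleNode" are made up for illustration, not taken from this
patch):

  #include "llvm/Support/raw_ostream.h"
  #include "llvm/TableGen/Record.h"
  using namespace llvm;

  // The const overload of getAllDerivedDefinitions returns
  // ArrayRef<const Record *>, so the backend can read the records but can
  // no longer mutate them.
  static void EmitExampleNodes(const RecordKeeper &Records, raw_ostream &OS) {
    ArrayRef<const Record *> Nodes =
        Records.getAllDerivedDefinitions("ExampleNode");
    for (const Record *R : Nodes)
      OS << R->getName() << "\n";
  }

Beyond the const-correctness, the ArrayRef binding is a non-owning view into
storage the RecordKeeper already caches, so it also avoids copying a
std::vector of pointers the way the old signatures did.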
This is part of an effort to improve const correctness in TableGen
backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 clang/utils/TableGen/ClangTypeNodesEmitter.cpp | 13 ++++++-------
 clang/utils/TableGen/TableGenBackends.h        |  3 ++-
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/clang/utils/TableGen/ClangTypeNodesEmitter.cpp b/clang/utils/TableGen/ClangTypeNodesEmitter.cpp
index 66bdf5e67602b..41a2d0cd066fe 100644
--- a/clang/utils/TableGen/ClangTypeNodesEmitter.cpp
+++ b/clang/utils/TableGen/ClangTypeNodesEmitter.cpp
@@ -74,16 +74,15 @@ using namespace clang::tblgen;
 
 namespace {
 class TypeNodeEmitter {
-  RecordKeeper &Records;
+  const RecordKeeper &Records;
   raw_ostream &Out;
-  const std::vector<Record *> Types;
+  ArrayRef<const Record *> Types;
   std::vector<StringRef> MacrosToUndef;
 
 public:
-  TypeNodeEmitter(RecordKeeper &records, raw_ostream &out)
-    : Records(records), Out(out),
-      Types(Records.getAllDerivedDefinitions(TypeNodeClassName)) {
-  }
+  TypeNodeEmitter(const RecordKeeper &records, raw_ostream &out)
+      : Records(records), Out(out),
+        Types(Records.getAllDerivedDefinitions(TypeNodeClassName)) {}
 
   void emit();
 
@@ -203,6 +202,6 @@ void TypeNodeEmitter::emitUndefs() {
   }
 }
 
-void clang::EmitClangTypeNodes(RecordKeeper &records, raw_ostream &out) {
+void clang::EmitClangTypeNodes(const RecordKeeper &records, raw_ostream &out) {
   TypeNodeEmitter(records, out).emit();
 }
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 01d16d2dc3e5f..79e9be6c03192 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -39,7 +39,8 @@ void EmitClangBasicReader(const llvm::RecordKeeper &Records,
                           llvm::raw_ostream &OS);
 void EmitClangBasicWriter(const llvm::RecordKeeper &Records,
                           llvm::raw_ostream &OS);
-void EmitClangTypeNodes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangTypeNodes(const llvm::RecordKeeper &Records,
+                        llvm::raw_ostream &OS);
 void EmitClangTypeReader(const llvm::RecordKeeper &Records,
                          llvm::raw_ostream &OS);
 void EmitClangTypeWriter(const llvm::RecordKeeper &Records,

From f637273d7708da0a01c3adb2bb85ae1c541f46a1 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Fri, 13 Sep 2024 07:51:17 -0700
Subject: [PATCH 09/43] [clang][TableGen] Change SACheckersEmitter to use const RecordKeeper (#108477)

Change SACheckersEmitter to use const RecordKeeper.
This is part of an effort to improve const correctness in TableGen
backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 clang/utils/TableGen/ClangSACheckersEmitter.cpp | 8 +++++---
 clang/utils/TableGen/TableGenBackends.h         | 3 ++-
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/clang/utils/TableGen/ClangSACheckersEmitter.cpp b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
index 2a2e466ae1979..44c2d8b31655d 100644
--- a/clang/utils/TableGen/ClangSACheckersEmitter.cpp
+++ b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
@@ -174,9 +174,11 @@ static void printOption(llvm::raw_ostream &OS, StringRef FullName,
     OS << "true";
 }
 
-void clang::EmitClangSACheckers(RecordKeeper &Records, raw_ostream &OS) {
-  std::vector<Record *> checkers = Records.getAllDerivedDefinitions("Checker");
-  std::vector<Record *> packages = Records.getAllDerivedDefinitions("Package");
+void clang::EmitClangSACheckers(const RecordKeeper &Records, raw_ostream &OS) {
+  ArrayRef<const Record *> checkers =
+      Records.getAllDerivedDefinitions("Checker");
+  ArrayRef<const Record *> packages =
+      Records.getAllDerivedDefinitions("Package");
 
   using SortedRecords = llvm::StringMap<const Record *>;
 
diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h
index 79e9be6c03192..4e59653fac06d 100644
--- a/clang/utils/TableGen/TableGenBackends.h
+++ b/clang/utils/TableGen/TableGenBackends.h
@@ -94,7 +94,8 @@ void EmitClangDiagGroups(const llvm::RecordKeeper &Records,
                          llvm::raw_ostream &OS);
 void EmitClangDiagsIndexName(const llvm::RecordKeeper &Records,
                              llvm::raw_ostream &OS);
-void EmitClangSACheckers(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);
+void EmitClangSACheckers(const llvm::RecordKeeper &Records,
+                         llvm::raw_ostream &OS);
 void EmitClangCommentHTMLTags(const llvm::RecordKeeper &Records,
                               llvm::raw_ostream &OS);
 
From d757bbf68f35dbcfd68580e3798cf301862dd314 Mon Sep 17 00:00:00 2001
From: Rahul Joshi
Date: Fri, 13 Sep 2024 07:51:42 -0700
Subject: [PATCH 10/43] [clang][TableGen] Change SyntaxEmitter to use const RecordKeeper (#108478)

Change SyntaxEmitter to use const RecordKeeper.
This is part of an effort to improve const correctness in TableGen
backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 clang/utils/TableGen/ClangSyntaxEmitter.cpp | 13 +++++++------
 clang/utils/TableGen/TableGenBackends.h     |  4 ++--
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang/utils/TableGen/ClangSyntaxEmitter.cpp b/clang/utils/TableGen/ClangSyntaxEmitter.cpp
index 2a69e4c353b6b..66b27be88f56f 100644
--- a/clang/utils/TableGen/ClangSyntaxEmitter.cpp
+++ b/clang/utils/TableGen/ClangSyntaxEmitter.cpp
@@ -41,11 +41,12 @@ using llvm::formatv;
 // stable and useful way, where abstract Node subclasses correspond to ranges.
class Hierarchy { public: - Hierarchy(llvm::RecordKeeper &Records) { - for (llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) + Hierarchy(const llvm::RecordKeeper &Records) { + for (const llvm::Record *T : Records.getAllDerivedDefinitions("NodeType")) add(T); - for (llvm::Record *Derived : Records.getAllDerivedDefinitions("NodeType")) - if (llvm::Record *Base = Derived->getValueAsOptionalDef("base")) + for (const llvm::Record *Derived : + Records.getAllDerivedDefinitions("NodeType")) + if (const llvm::Record *Base = Derived->getValueAsOptionalDef("base")) link(Derived, Base); for (NodeType &N : AllTypes) { llvm::sort(N.Derived, [](const NodeType *L, const NodeType *R) { @@ -127,7 +128,7 @@ struct SyntaxConstraint { } // namespace -void clang::EmitClangSyntaxNodeList(llvm::RecordKeeper &Records, +void clang::EmitClangSyntaxNodeList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS) { llvm::emitSourceFileHeader("Syntax tree node list", OS, Records); Hierarchy H(Records); @@ -186,7 +187,7 @@ static void printDoc(llvm::StringRef Doc, llvm::raw_ostream &OS) { } } -void clang::EmitClangSyntaxNodeClasses(llvm::RecordKeeper &Records, +void clang::EmitClangSyntaxNodeClasses(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS) { llvm::emitSourceFileHeader("Syntax tree node list", OS, Records); Hierarchy H(Records); diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 4e59653fac06d..0e09c7917518f 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -110,9 +110,9 @@ void EmitClangCommentCommandList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangOpcodes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangSyntaxNodeList(llvm::RecordKeeper &Records, +void EmitClangSyntaxNodeList(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitClangSyntaxNodeClasses(llvm::RecordKeeper &Records, +void EmitClangSyntaxNodeClasses(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitNeon(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 75d87247871110efe83edcb0107176614f88a5d9 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 13 Sep 2024 07:52:14 -0700 Subject: [PATCH 11/43] [clang][TableGen] Change MVE Emitter to use const RecordKeeper (#108500) Change MVE Emitter to use const RecordKeeper. 
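For context on the code this patch touches: MveEmitter builds intrinsics by
walking TableGen dag values, so the const-qualification has to be threaded
through its dag-traversal idiom, in which a dag's operator is an Init that is
cast to DefInit to reach the underlying Record. A minimal sketch of that idiom
(the helper name dagOperatorName is hypothetical, not code from this diff;
"codegen" is the dag field the emitter actually reads):

  #include "llvm/TableGen/Record.h"
  using namespace llvm;

  // Fetch a dag field from a record, then recover the Record behind the
  // dag's operator via cast<DefInit>. After this patch the Record pointers
  // are const-qualified.
  static StringRef dagOperatorName(const Record *R) {
    DagInit *CodeDag = R->getValueAsDag("codegen");
    const Record *Op = cast<DefInit>(CodeDag->getOperator())->getDef();
    // The emitter then dispatches on the operator's TableGen class,
    // e.g. Op->isSubClassOf("CustomCodegen").
    return Op->getName();
  }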
This is part of an effort to improve const correctness in TableGen
backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 clang/utils/TableGen/MveEmitter.cpp     | 71 +++++++++++++------------
 clang/utils/TableGen/TableGenBackends.h | 26 +++++----
 2 files changed, 52 insertions(+), 45 deletions(-)

diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
index bb4f091604f5e..6cfaa891241fa 100644
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -958,7 +958,7 @@ class ACLEIntrinsic {
            ";\n";
   }
 
-  ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param);
+  ACLEIntrinsic(EmitterBase &ME, const Record *R, const Type *Param);
 };
 
 // -----------------------------------------------------------------------------
@@ -988,7 +988,7 @@ class EmitterBase {
   const ScalarType *getScalarType(StringRef Name) {
     return ScalarTypes[std::string(Name)].get();
   }
-  const ScalarType *getScalarType(Record *R) {
+  const ScalarType *getScalarType(const Record *R) {
     return getScalarType(R->getName());
   }
   const VectorType *getVectorType(const ScalarType *ST, unsigned Lanes) {
@@ -1028,7 +1028,7 @@ class EmitterBase {
   // the Params list in the Tablegen record for the intrinsic), which is used
   // to expand Tablegen classes like 'Vector' which mean something different in
   // each member of a parametric family.
-  const Type *getType(Record *R, const Type *Param);
+  const Type *getType(const Record *R, const Type *Param);
   const Type *getType(DagInit *D, const Type *Param);
   const Type *getType(Init *I, const Type *Param);
 
@@ -1046,7 +1046,7 @@ class EmitterBase {
 
   // Constructor and top-level functions.
 
-  EmitterBase(RecordKeeper &Records);
+  EmitterBase(const RecordKeeper &Records);
   virtual ~EmitterBase() = default;
 
   virtual void EmitHeader(raw_ostream &OS) = 0;
@@ -1065,7 +1065,7 @@ const Type *EmitterBase::getType(Init *I, const Type *Param) {
   PrintFatalError("Could not convert this value into a type");
 }
 
-const Type *EmitterBase::getType(Record *R, const Type *Param) {
+const Type *EmitterBase::getType(const Record *R, const Type *Param) {
   // Pass to a subfield of any wrapper records. We don't expect more than one
   // of these: immediate operands are used as plain numbers rather than as
   // llvm::Value, so it's meaningless to promote their type anyway.
@@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
   // The meat of the getType system: types in the Tablegen are represented by a
   // dag whose operators select sub-cases of this function.
-  Record *Op = cast<DefInit>(D->getOperator())->getDef();
+  const Record *Op = cast<DefInit>(D->getOperator())->getDef();
   if (!Op->isSubClassOf("ComplexTypeOp"))
     PrintFatalError(
         "Expected ComplexTypeOp as dag operator in type expression");
@@ -1154,7 +1154,7 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) {
 
 Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
                                        const Type *Param) {
-  Record *Op = cast<DefInit>(D->getOperator())->getDef();
+  const Record *Op = cast<DefInit>(D->getOperator())->getDef();
 
   if (Op->getName() == "seq") {
     Result::Scope SubScope = Scope;
@@ -1211,7 +1211,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
   } else if (Op->getName() == "unsignedflag") {
     if (D->getNumArgs() != 1)
       PrintFatalError("unsignedflag should have exactly one argument");
-    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    const Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
     if (!TypeRec->isSubClassOf("Type"))
       PrintFatalError("unsignedflag's argument should be a type");
     if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
@@ -1223,7 +1223,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
   } else if (Op->getName() == "bitsize") {
     if (D->getNumArgs() != 1)
       PrintFatalError("bitsize should have exactly one argument");
-    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    const Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
     if (!TypeRec->isSubClassOf("Type"))
       PrintFatalError("bitsize's argument should be a type");
     if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
@@ -1239,7 +1239,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
   if (Op->isSubClassOf("IRBuilderBase")) {
     std::set<unsigned> AddressArgs;
     std::map<unsigned, std::string> IntegerArgs;
-    for (Record *sp : Op->getValueAsListOfDefs("special_params")) {
+    for (const Record *sp : Op->getValueAsListOfDefs("special_params")) {
       unsigned Index = sp->getValueAsInt("index");
       if (sp->isSubClassOf("IRBuilderAddrParam")) {
         AddressArgs.insert(Index);
@@ -1251,7 +1251,7 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope,
                                              Args, AddressArgs, IntegerArgs);
   } else if (Op->isSubClassOf("IRIntBase")) {
     std::vector<const Type *> ParamTypes;
-    for (Record *RParam : Op->getValueAsListOfDefs("params"))
+    for (const Record *RParam : Op->getValueAsListOfDefs("params"))
       ParamTypes.push_back(getType(RParam, Param));
     std::string IntName = std::string(Op->getValueAsString("intname"));
     if (Op->getValueAsBit("appendKind"))
@@ -1294,7 +1294,7 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum,
     return getCodeForDag(DI, Scope, Param);
 
   if (auto *DI = dyn_cast<DefInit>(Arg)) {
-    Record *Rec = DI->getDef();
+    const Record *Rec = DI->getDef();
     if (Rec->isSubClassOf("Type")) {
       const Type *T = getType(Rec, Param);
       return std::make_shared<TypeResult>(T);
@@ -1328,7 +1328,8 @@ Result::Ptr EmitterBase::getCodeForArg(unsigned ArgNum, const Type *ArgType,
   return V;
 }
 
-ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param)
+ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R,
+                             const Type *Param)
     : ReturnType(ME.getType(R->getValueAsDef("ret"), Param)) {
   // Derive the intrinsic's full name, by taking the name of the
   // Tablegen record (or override) and appending the suffix from its
@@ -1346,7 +1347,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param)
   // full name as specified by its 'pnt' member ('polymorphic name type'),
   // which indicates how many type suffixes to remove, and any other piece of
   // the name that should be removed.
-  Record *PolymorphicNameType = R->getValueAsDef("pnt");
+  const Record *PolymorphicNameType = R->getValueAsDef("pnt");
   SmallVector<StringRef, 8> NameParts;
   StringRef(FullName).split(NameParts, '_');
   for (unsigned i = 0, e = PolymorphicNameType->getValueAsInt(
@@ -1393,11 +1394,11 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param)
     // what values it can take, for Sema checking.
     bool Immediate = false;
     if (auto TypeDI = dyn_cast<DefInit>(TypeInit)) {
-      Record *TypeRec = TypeDI->getDef();
+      const Record *TypeRec = TypeDI->getDef();
       if (TypeRec->isSubClassOf("Immediate")) {
         Immediate = true;
 
-        Record *Bounds = TypeRec->getValueAsDef("bounds");
+        const Record *Bounds = TypeRec->getValueAsDef("bounds");
         ImmediateArg &IA = ImmediateArgs[i];
         if (Bounds->isSubClassOf("IB_ConstRange")) {
           IA.boundsType = ImmediateArg::BoundsType::ExplicitRange;
@@ -1440,7 +1441,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param)
   // Finally, go through the codegen dag and translate it into a Result object
   // (with an arbitrary DAG of depended-on Results hanging off it).
   DagInit *CodeDag = R->getValueAsDag("codegen");
-  Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
+  const Record *MainOp = cast<DefInit>(CodeDag->getOperator())->getDef();
   if (MainOp->isSubClassOf("CustomCodegen")) {
     // Or, if it's the special case of CustomCodegen, just accumulate
     // a list of parameters we're going to assign to variables before
@@ -1464,7 +1465,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, Record *R, const Type *Param)
   }
 }
 
-EmitterBase::EmitterBase(RecordKeeper &Records) {
+EmitterBase::EmitterBase(const RecordKeeper &Records) {
  // Construct the whole EmitterBase.
 
  // First, look up all the instances of PrimitiveType. This gives us the list
@@ -1472,13 +1473,13 @@
   // collect all the useful ScalarType instances into a big list so that we can
   // use it for operations such as 'find the unsigned version of this signed
   // integer type'.
-  for (Record *R : Records.getAllDerivedDefinitions("PrimitiveType"))
+  for (const Record *R : Records.getAllDerivedDefinitions("PrimitiveType"))
    ScalarTypes[std::string(R->getName())] = std::make_unique<ScalarType>(R);
 
   // Now go through the instances of Intrinsic, and for each one, iterate
   // through its list of type parameters making an ACLEIntrinsic for each one.
- for (Record *R : Records.getAllDerivedDefinitions("Intrinsic")) { - for (Record *RParam : R->getValueAsListOfDefs("params")) { + for (const Record *R : Records.getAllDerivedDefinitions("Intrinsic")) { + for (const Record *RParam : R->getValueAsListOfDefs("params")) { const Type *Param = getType(RParam, getVoidType()); auto Intrinsic = std::make_unique(*this, R, Param); ACLEIntrinsics[Intrinsic->fullName()] = std::move(Intrinsic); @@ -1752,7 +1753,7 @@ void EmitterBase::GroupSemaChecks( class MveEmitter : public EmitterBase { public: - MveEmitter(RecordKeeper &Records) : EmitterBase(Records){}; + MveEmitter(const RecordKeeper &Records) : EmitterBase(Records) {} void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; @@ -2010,14 +2011,14 @@ class CdeEmitter : public EmitterBase { std::map FunctionMacros; public: - CdeEmitter(RecordKeeper &Records); + CdeEmitter(const RecordKeeper &Records); void EmitHeader(raw_ostream &OS) override; void EmitBuiltinDef(raw_ostream &OS) override; void EmitBuiltinSema(raw_ostream &OS) override; }; -CdeEmitter::CdeEmitter(RecordKeeper &Records) : EmitterBase(Records) { - for (Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) +CdeEmitter::CdeEmitter(const RecordKeeper &Records) : EmitterBase(Records) { + for (const Record *R : Records.getAllDerivedDefinitions("FunctionMacro")) FunctionMacros.emplace(R->getName(), FunctionMacro(*R)); } @@ -2179,45 +2180,45 @@ namespace clang { // MVE -void EmitMveHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveHeader(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitHeader(OS); } -void EmitMveBuiltinDef(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinDef(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinDef(OS); } -void EmitMveBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinSema(OS); } -void EmitMveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinCG(OS); } -void EmitMveBuiltinAliases(RecordKeeper &Records, raw_ostream &OS) { +void EmitMveBuiltinAliases(const RecordKeeper &Records, raw_ostream &OS) { MveEmitter(Records).EmitBuiltinAliases(OS); } // CDE -void EmitCdeHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeHeader(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitHeader(OS); } -void EmitCdeBuiltinDef(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinDef(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinDef(OS); } -void EmitCdeBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinSema(OS); } -void EmitCdeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinCG(OS); } -void EmitCdeBuiltinAliases(RecordKeeper &Records, raw_ostream &OS) { +void EmitCdeBuiltinAliases(const RecordKeeper &Records, raw_ostream &OS) { CdeEmitter(Records).EmitBuiltinAliases(OS); } diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 0e09c7917518f..e6b2eacb8c704 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ 
b/clang/utils/TableGen/TableGenBackends.h @@ -137,22 +137,28 @@ void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitMveBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitMveHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitMveBuiltinDef(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitMveBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitMveBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitMveBuiltinAliases(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitRVVHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitRVVBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitRVVBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitRVVBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitCdeBuiltinAliases(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCdeHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCdeBuiltinDef(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitCdeBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitCdeBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitCdeBuiltinAliases(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitClangAttrDocs(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From a4b161736881634baac52163ceca62a595843054 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 13 Sep 2024 07:52:37 -0700 Subject: [PATCH 12/43] [clang][TableGen] Change NeonEmitter to use const RecordKeeper (#108501) Change NeonEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/NeonEmitter.cpp | 61 ++++++++++++------------- clang/utils/TableGen/TableGenBackends.h | 12 ++--- 2 files changed, 34 insertions(+), 39 deletions(-) diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 4707ce1ea3b79..9e5480be20ada 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -59,7 +59,7 @@ namespace { // While globals are generally bad, this one allows us to perform assertions // liberally and somehow still trace them back to the def they indirectly // came from. 
-static Record *CurrentRecord = nullptr; +static const Record *CurrentRecord = nullptr; static void assert_with_loc(bool Assertion, const std::string &Str) { if (!Assertion) { if (CurrentRecord) @@ -308,7 +308,7 @@ class Variable { /// a particular typespec and prototype. class Intrinsic { /// The Record this intrinsic was created from. - Record *R; + const Record *R; /// The unmangled name. std::string Name; /// The input and output typespecs. InTS == OutTS except when @@ -371,7 +371,7 @@ class Intrinsic { } public: - Intrinsic(Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, + Intrinsic(const Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter, StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, bool BigEndianSafe) @@ -442,7 +442,7 @@ class Intrinsic { } /// Get the Record that this intrinsic is based off. - Record *getRecord() const { return R; } + const Record *getRecord() const { return R; } /// Get the set of Intrinsics that this intrinsic calls. /// this is the set of immediate dependencies, NOT the /// transitive closure. @@ -576,12 +576,12 @@ class Intrinsic { //===----------------------------------------------------------------------===// class NeonEmitter { - RecordKeeper &Records; - DenseMap ClassMap; + const RecordKeeper &Records; + DenseMap ClassMap; std::map> IntrinsicMap; unsigned UniqueNumber; - void createIntrinsic(Record *R, SmallVectorImpl &Out); + void createIntrinsic(const Record *R, SmallVectorImpl &Out); void genBuiltinsDef(raw_ostream &OS, SmallVectorImpl &Defs); void genStreamingSVECompatibleList(raw_ostream &OS, SmallVectorImpl &Defs); @@ -601,15 +601,15 @@ class NeonEmitter { /// Called by Intrinsic - returns a globally-unique number. unsigned getUniqueNumber() { return UniqueNumber++; } - NeonEmitter(RecordKeeper &R) : Records(R), UniqueNumber(0) { - Record *SI = R.getClass("SInst"); - Record *II = R.getClass("IInst"); - Record *WI = R.getClass("WInst"); - Record *SOpI = R.getClass("SOpInst"); - Record *IOpI = R.getClass("IOpInst"); - Record *WOpI = R.getClass("WOpInst"); - Record *LOpI = R.getClass("LOpInst"); - Record *NoTestOpI = R.getClass("NoTestOpInst"); + NeonEmitter(const RecordKeeper &R) : Records(R), UniqueNumber(0) { + const Record *SI = R.getClass("SInst"); + const Record *II = R.getClass("IInst"); + const Record *WI = R.getClass("WInst"); + const Record *SOpI = R.getClass("SOpInst"); + const Record *IOpI = R.getClass("IOpInst"); + const Record *WOpI = R.getClass("WOpInst"); + const Record *LOpI = R.getClass("LOpInst"); + const Record *NoTestOpI = R.getClass("NoTestOpInst"); ClassMap[SI] = ClassS; ClassMap[II] = ClassI; @@ -1979,12 +1979,12 @@ Intrinsic &NeonEmitter::getIntrinsic(StringRef Name, ArrayRef Types, return *GoodVec.front(); } -void NeonEmitter::createIntrinsic(Record *R, +void NeonEmitter::createIntrinsic(const Record *R, SmallVectorImpl &Out) { std::string Name = std::string(R->getValueAsString("Name")); std::string Proto = std::string(R->getValueAsString("Prototype")); std::string Types = std::string(R->getValueAsString("Types")); - Record *OperationRec = R->getValueAsDef("Operation"); + const Record *OperationRec = R->getValueAsDef("Operation"); bool BigEndianSafe = R->getValueAsBit("BigEndianSafe"); std::string ArchGuard = std::string(R->getValueAsString("ArchGuard")); std::string TargetGuard = std::string(R->getValueAsString("TargetGuard")); @@ -2240,10 +2240,8 @@ void NeonEmitter::genIntrinsicRangeCheckCode( /// 2. 
the SemaChecking code for the type overload checking. /// 3. the SemaChecking code for validation of intrinsic immediate arguments. void NeonEmitter::runHeader(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - SmallVector Defs; - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); // Generate shared BuiltinsXXX.def @@ -2402,8 +2400,7 @@ void NeonEmitter::run(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2510,8 +2507,7 @@ void NeonEmitter::runFP16(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2619,8 +2615,7 @@ void NeonEmitter::runBF16(raw_ostream &OS) { "__nodebug__))\n\n"; SmallVector Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); - for (auto *R : RV) + for (const Record *R : Records.getAllDerivedDefinitions("Inst")) createIntrinsic(R, Defs); for (auto *I : Defs) @@ -2674,26 +2669,26 @@ void NeonEmitter::runBF16(raw_ostream &OS) { OS << "#endif\n"; } -void clang::EmitNeon(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeon(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).run(OS); } -void clang::EmitFP16(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitFP16(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runFP16(OS); } -void clang::EmitBF16(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitBF16(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runBF16(OS); } -void clang::EmitNeonSema(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeonSema(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runHeader(OS); } -void clang::EmitVectorTypes(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitVectorTypes(const RecordKeeper &Records, raw_ostream &OS) { NeonEmitter(Records).runVectorTypes(OS); } -void clang::EmitNeonTest(RecordKeeper &Records, raw_ostream &OS) { +void clang::EmitNeonTest(const RecordKeeper &Records, raw_ostream &OS) { llvm_unreachable("Neon test generation no longer implemented!"); } diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index e6b2eacb8c704..030040b85e6bf 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -115,12 +115,12 @@ void EmitClangSyntaxNodeList(const llvm::RecordKeeper &Records, void EmitClangSyntaxNodeClasses(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeon(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitFP16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitBF16(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeonSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitVectorTypes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitNeonTest(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitNeon(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitFP16(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitBF16(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void 
EmitNeonSema(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitVectorTypes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitNeonTest(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitImmCheckTypes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 974fa8522be37eb0a111fee004b0bc8a9debf9fc Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 13 Sep 2024 07:53:01 -0700 Subject: [PATCH 13/43] [clang][TableGen] Change RISCVVEmitter to use const RecordKeeper (#108502) Change RISCVVEmitter to use const RecordKeeper. This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/RISCVVEmitter.cpp | 23 ++++++++++------------- clang/utils/TableGen/TableGenBackends.h | 9 +++++---- 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/clang/utils/TableGen/RISCVVEmitter.cpp b/clang/utils/TableGen/RISCVVEmitter.cpp index d05236bb4e909..4ef83e7b608dc 100644 --- a/clang/utils/TableGen/RISCVVEmitter.cpp +++ b/clang/utils/TableGen/RISCVVEmitter.cpp @@ -95,11 +95,11 @@ class SemaSignatureTable { class RVVEmitter { private: - RecordKeeper &Records; + const RecordKeeper &Records; RVVTypeCache TypeCache; public: - RVVEmitter(RecordKeeper &R) : Records(R) {} + RVVEmitter(const RecordKeeper &R) : Records(R) {} /// Emit riscv_vector.h void createHeader(raw_ostream &o); @@ -554,8 +554,7 @@ void RVVEmitter::createCodeGen(raw_ostream &OS) { void RVVEmitter::createRVVIntrinsics( std::vector> &Out, std::vector *SemaRecords) { - std::vector RV = Records.getAllDerivedDefinitions("RVVBuiltin"); - for (auto *R : RV) { + for (const Record *R : Records.getAllDerivedDefinitions("RVVBuiltin")) { StringRef Name = R->getValueAsString("Name"); StringRef SuffixProto = R->getValueAsString("Suffix"); StringRef OverloadedName = R->getValueAsString("OverloadedName"); @@ -565,10 +564,10 @@ void RVVEmitter::createRVVIntrinsics( bool HasMasked = R->getValueAsBit("HasMasked"); bool HasMaskedOffOperand = R->getValueAsBit("HasMaskedOffOperand"); bool HasVL = R->getValueAsBit("HasVL"); - Record *MPSRecord = R->getValueAsDef("MaskedPolicyScheme"); + const Record *MPSRecord = R->getValueAsDef("MaskedPolicyScheme"); auto MaskedPolicyScheme = static_cast(MPSRecord->getValueAsInt("Value")); - Record *UMPSRecord = R->getValueAsDef("UnMaskedPolicyScheme"); + const Record *UMPSRecord = R->getValueAsDef("UnMaskedPolicyScheme"); auto UnMaskedPolicyScheme = static_cast(UMPSRecord->getValueAsInt("Value")); std::vector Log2LMULList = R->getValueAsListOfInts("Log2LMUL"); @@ -752,9 +751,7 @@ void RVVEmitter::createRVVIntrinsics( } void RVVEmitter::printHeaderCode(raw_ostream &OS) { - std::vector RVVHeaders = - Records.getAllDerivedDefinitions("RVVHeader"); - for (auto *R : RVVHeaders) { + for (const Record *R : Records.getAllDerivedDefinitions("RVVHeader")) { StringRef HeaderCodeStr = R->getValueAsString("HeaderCode"); OS << HeaderCodeStr.str(); } @@ -822,19 +819,19 @@ void RVVEmitter::createSema(raw_ostream &OS) { } namespace clang { -void EmitRVVHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVHeader(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createHeader(OS); } -void EmitRVVBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltins(const RecordKeeper &Records, raw_ostream &OS) { 
RVVEmitter(Records).createBuiltins(OS); } -void EmitRVVBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createCodeGen(OS); } -void EmitRVVBuiltinSema(RecordKeeper &Records, raw_ostream &OS) { +void EmitRVVBuiltinSema(const RecordKeeper &Records, raw_ostream &OS) { RVVEmitter(Records).createSema(OS); } diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 030040b85e6bf..07cb2812fa884 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -146,10 +146,11 @@ void EmitMveBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinAliases(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitRVVBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitRVVBuiltinSema(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitCdeHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitCdeBuiltinDef(const llvm::RecordKeeper &Records, From ab06a18b59eddfa0719faa1fe40e83829939c6db Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 13 Sep 2024 07:53:23 -0700 Subject: [PATCH 14/43] [IRSim] Avoid repeated hash lookups (NFC) (#108483) --- llvm/lib/Analysis/IRSimilarityIdentifier.cpp | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp index 41815c633fdf2..42e986e6179dd 100644 --- a/llvm/lib/Analysis/IRSimilarityIdentifier.cpp +++ b/llvm/lib/Analysis/IRSimilarityIdentifier.cpp @@ -1420,16 +1420,8 @@ void IRSimilarityIdentifier::findCandidates( // IRSimilarityCandidates that include that instruction. for (IRSimilarityCandidate &IRCand : SimilarityCandidates->back()) { for (unsigned Idx = IRCand.getStartIdx(), Edx = IRCand.getEndIdx(); - Idx <= Edx; ++Idx) { - DenseMap>::iterator - IdIt; - IdIt = IndexToIncludedCand.find(Idx); - bool Inserted = false; - if (IdIt == IndexToIncludedCand.end()) - std::tie(IdIt, Inserted) = IndexToIncludedCand.insert( - std::make_pair(Idx, DenseSet())); - IdIt->second.insert(&IRCand); - } + Idx <= Edx; ++Idx) + IndexToIncludedCand[Idx].insert(&IRCand); // Add mapping of candidate to the overall similarity group number. CandToGroup.insert( std::make_pair(&IRCand, SimilarityCandidates->size() - 1)); From 711278e273826f3903cef448f433fe2135c569c6 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Fri, 13 Sep 2024 07:53:30 -0700 Subject: [PATCH 15/43] [clang][TableGen] Change SVE Emitter to use const RecordKeeper (#108503) Change SVE Emitter to use const RecordKeeper. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/SveEmitter.cpp | 52 ++++++++++++------------- clang/utils/TableGen/TableGenBackends.h | 34 +++++++++------- 2 files changed, 46 insertions(+), 40 deletions(-) diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index b2e2db1a40990..5abf6fc49bc30 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -280,7 +280,7 @@ class SVEEmitter { static const std::array Reinterprets; - RecordKeeper &Records; + const RecordKeeper &Records; llvm::StringMap EltTypes; llvm::StringMap MemEltTypes; llvm::StringMap FlagTypes; @@ -288,7 +288,7 @@ class SVEEmitter { llvm::StringMap ImmCheckTypes; public: - SVEEmitter(RecordKeeper &R) : Records(R) { + SVEEmitter(const RecordKeeper &R) : Records(R) { for (auto *RV : Records.getAllDerivedDefinitions("EltType")) EltTypes[RV->getNameInitAsString()] = RV->getValueAsInt("Value"); for (auto *RV : Records.getAllDerivedDefinitions("MemEltType")) @@ -397,7 +397,7 @@ class SVEEmitter { void createBuiltinZAState(raw_ostream &OS); /// Create intrinsic and add it to \p Out - void createIntrinsic(Record *R, + void createIntrinsic(const Record *R, SmallVectorImpl> &Out); }; @@ -1151,7 +1151,7 @@ uint64_t SVEEmitter::encodeTypeFlags(const SVEType &T) { } void SVEEmitter::createIntrinsic( - Record *R, SmallVectorImpl> &Out) { + const Record *R, SmallVectorImpl> &Out) { StringRef Name = R->getValueAsString("Name"); StringRef Proto = R->getValueAsString("Prototype"); StringRef Types = R->getValueAsString("Types"); @@ -1225,7 +1225,7 @@ void SVEEmitter::createCoreHeaderIntrinsics(raw_ostream &OS, SVEEmitter &Emitter, ACLEKind Kind) { SmallVector, 128> Defs; - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); for (auto *R : RV) createIntrinsic(R, Defs); @@ -1427,7 +1427,7 @@ void SVEEmitter::createHeader(raw_ostream &OS) { } void SVEEmitter::createBuiltins(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1469,7 +1469,7 @@ void SVEEmitter::createBuiltins(raw_ostream &OS) { } void SVEEmitter::createCodeGenMap(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1502,7 +1502,7 @@ void SVEEmitter::createCodeGenMap(raw_ostream &OS) { } void SVEEmitter::createRangeChecks(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1634,7 +1634,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) { } void SVEEmitter::createSMEBuiltins(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1662,7 +1662,7 @@ void SVEEmitter::createSMEBuiltins(raw_ostream &OS) { } void SVEEmitter::createSMECodeGenMap(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector 
RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1696,7 +1696,7 @@ void SVEEmitter::createSMECodeGenMap(raw_ostream &OS) { } void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) { createIntrinsic(R, Defs); @@ -1733,7 +1733,7 @@ void SVEEmitter::createSMERangeChecks(raw_ostream &OS) { } void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1773,7 +1773,7 @@ void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { } void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { - std::vector RV = Records.getAllDerivedDefinitions("Inst"); + std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; for (auto *R : RV) createIntrinsic(R, Defs); @@ -1826,55 +1826,55 @@ void SVEEmitter::createStreamingAttrs(raw_ostream &OS, ACLEKind Kind) { } namespace clang { -void EmitSveHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveHeader(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createHeader(OS); } -void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveBuiltins(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createBuiltins(OS); } -void EmitSveBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createCodeGenMap(OS); } -void EmitSveRangeChecks(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveRangeChecks(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createRangeChecks(OS); } -void EmitSveTypeFlags(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveTypeFlags(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createTypeFlags(OS); } -void EmitImmCheckTypes(RecordKeeper &Records, raw_ostream &OS) { +void EmitImmCheckTypes(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createImmCheckTypes(OS); } -void EmitSveStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { +void EmitSveStreamingAttrs(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SVE); } -void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeHeader(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEHeader(OS); } -void EmitSmeBuiltins(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeBuiltins(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMEBuiltins(OS); } -void EmitSmeBuiltinCG(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeBuiltinCG(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMECodeGenMap(OS); } -void EmitSmeRangeChecks(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeRangeChecks(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createSMERangeChecks(OS); } -void EmitSmeStreamingAttrs(RecordKeeper &Records, raw_ostream &OS) { +void EmitSmeStreamingAttrs(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createStreamingAttrs(OS, ACLEKind::SME); } -void EmitSmeBuiltinZAState(RecordKeeper &Records, raw_ostream &OS) { +void 
EmitSmeBuiltinZAState(const RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createBuiltinZAState(OS); } } // End namespace clang diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 07cb2812fa884..f7527ac535a87 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -122,20 +122,26 @@ void EmitNeonSema(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitVectorTypes(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitNeonTest(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitImmCheckTypes(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSveStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); - -void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltinCG(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeStreamingAttrs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); -void EmitSmeBuiltinZAState(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitImmCheckTypes(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSveHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveTypeFlags(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSveRangeChecks(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSveStreamingAttrs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); + +void EmitSmeHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltins(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeBuiltinCG(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeRangeChecks(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSmeStreamingAttrs(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); +void EmitSmeBuiltinZAState(const llvm::RecordKeeper &Records, + llvm::raw_ostream &OS); void EmitMveHeader(const llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinDef(const llvm::RecordKeeper &Records, From 99fe5954d258511ec2e36e8c7f612568e9701ab7 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Fri, 13 Sep 2024 10:58:39 -0400 Subject: [PATCH 16/43] [libc] implement clock_gettime using vDSO (#108458) supersedes https://github.com/llvm/llvm-project/pull/91805 --- libc/src/__support/time/linux/CMakeLists.txt | 1 + libc/src/__support/time/linux/clock_gettime.h | 35 +++++++++++++++---- 2 files changed, 29 insertions(+), 7 deletions(-) diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt index 1b41c7cb0a98a..4297a02986668 100644 --- a/libc/src/__support/time/linux/CMakeLists.txt +++ b/libc/src/__support/time/linux/CMakeLists.txt 
@@ -9,6 +9,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.error_or
     libc.src.__support.OSUtil.osutil
+    libc.src.__support.OSUtil.vdso
 )
 
 add_header_library(
diff --git a/libc/src/__support/time/linux/clock_gettime.h b/libc/src/__support/time/linux/clock_gettime.h
index eca1ba70de592..517cca91391a7 100644
--- a/libc/src/__support/time/linux/clock_gettime.h
+++ b/libc/src/__support/time/linux/clock_gettime.h
@@ -11,26 +11,47 @@
 #include "hdr/types/clockid_t.h"
 #include "hdr/types/struct_timespec.h"
+#include "src/__support/OSUtil/linux/vdso.h"
 #include "src/__support/OSUtil/syscall.h"
 #include "src/__support/common.h"
 #include "src/__support/error_or.h"
 #include "src/__support/macros/config.h"
 #include <sys/syscall.h>
+#if defined(SYS_clock_gettime64)
+#include <linux/time_types.h>
+#endif
+
 namespace LIBC_NAMESPACE_DECL {
 namespace internal {
 LIBC_INLINE ErrorOr<int> clock_gettime(clockid_t clockid, timespec *ts) {
-#if SYS_clock_gettime
-  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime,
-                                              static_cast<long>(clockid),
-                                              reinterpret_cast<long>(ts));
+  using namespace vdso;
+  int ret;
+#if defined(SYS_clock_gettime)
+  TypedSymbol<VDSOSym::ClockGetTime> clock_gettime;
+  if (LIBC_LIKELY(clock_gettime != nullptr))
+    ret = clock_gettime(clockid, ts);
+  else
+    ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime,
+                                            static_cast<long>(clockid),
+                                            reinterpret_cast<long>(ts));
 #elif defined(SYS_clock_gettime64)
   static_assert(
       sizeof(time_t) == sizeof(int64_t),
       "SYS_clock_gettime64 requires struct timespec with 64-bit members.");
-  int ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime64,
-                                              static_cast<long>(clockid),
-                                              reinterpret_cast<long>(ts));
+
+  TypedSymbol<VDSOSym::ClockGetTime64> clock_gettime64;
+  __kernel_timespec ts64{};
+  if (LIBC_LIKELY(clock_gettime64 != nullptr))
+    ret = clock_gettime64(clockid, &ts64);
+  else
+    ret = LIBC_NAMESPACE::syscall_impl<int>(SYS_clock_gettime64,
+                                            static_cast<long>(clockid),
+                                            reinterpret_cast<long>(&ts64));
+  if (ret == 0) {
+    ts->tv_sec = static_cast<decltype(ts->tv_sec)>(ts64.tv_sec);
+    ts->tv_nsec = static_cast<decltype(ts->tv_nsec)>(ts64.tv_nsec);
+  }
 #else
 #error "SYS_clock_gettime and SYS_clock_gettime64 syscalls not available."
 #endif

From ea5d37f4c1fd9c0850bee35958568a8b6596b3f9 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Fri, 13 Sep 2024 17:05:02 +0200
Subject: [PATCH 17/43] [LLD][COFF] Add Support for ARM64EC Import Thunks
 (#108460)

ARM64EC import thunks function similarly to regular ARM64 thunks but use
a mangled name and perform the call through the auxiliary IAT.
---
 lld/COFF/Chunks.h                 | 10 ++++--
 lld/COFF/InputFiles.cpp           | 11 +++++--
 lld/COFF/InputFiles.h             |  1 +
 lld/COFF/Writer.cpp               | 22 +++++++++----
 lld/test/COFF/arm64ec-import.test | 55 ++++++++++++++++++-------------
 5 files changed, 64 insertions(+), 35 deletions(-)

diff --git a/lld/COFF/Chunks.h b/lld/COFF/Chunks.h
index 8ad17a2850782..24d7c37de7f3b 100644
--- a/lld/COFF/Chunks.h
+++ b/lld/COFF/Chunks.h
@@ -601,13 +601,17 @@ class ImportThunkChunkARM : public ImportThunkChunk {
 
 class ImportThunkChunkARM64 : public ImportThunkChunk {
 public:
-  explicit ImportThunkChunkARM64(COFFLinkerContext &ctx, Defined *s)
-      : ImportThunkChunk(ctx, s) {
+  explicit ImportThunkChunkARM64(COFFLinkerContext &ctx, Defined *s,
+                                 MachineTypes machine)
+      : ImportThunkChunk(ctx, s), machine(machine) {
     setAlignment(4);
   }
   size_t getSize() const override { return sizeof(importThunkARM64); }
   void writeTo(uint8_t *buf) const override;
-  MachineTypes getMachine() const override { return ARM64; }
+  MachineTypes getMachine() const override { return machine; }
+
+private:
+  MachineTypes machine;
 };
 
 // ARM64EC __impchk_* thunk implementation.
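The practical effect of the new `machine` parameter is easiest to see at the
two instantiation sites this patch adds (they appear in InputFiles.cpp below).
A minimal sketch, using the symbol names from this patch:

  // Native ARM64 import thunk: jumps through the regular IAT slot.
  ImportThunkChunk *thunk = make<ImportThunkChunkARM64>(ctx, impSym, ARM64);

  // ARM64EC variant: reuses the same instruction template, but is registered
  // under the mangled "#name" symbol and resolves through the auxiliary IAT,
  // so getMachine() must report ARM64EC rather than a hardcoded ARM64.
  ImportThunkChunk *ecThunk =
      make<ImportThunkChunkARM64>(ctx, impECSym, ARM64EC);
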
diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index ee39b46624444..94ad7f3ceb306 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1018,7 +1018,7 @@ ImportThunkChunk *ImportFile::makeImportThunk() { case I386: return make(ctx, impSym); case ARM64: - return make(ctx, impSym); + return make(ctx, impSym, ARM64); case ARMNT: return make(ctx, impSym); } @@ -1109,7 +1109,14 @@ void ImportFile::parse() { } else { thunkSym = ctx.symtab.addImportThunk( name, impSym, make(ctx, impSym)); - // FIXME: Add aux IAT symbols. + + if (std::optional mangledName = + getArm64ECMangledFunctionName(name)) { + StringRef auxThunkName = saver().save(*mangledName); + auxThunkSym = ctx.symtab.addImportThunk( + auxThunkName, impECSym, + make(ctx, impECSym, ARM64EC)); + } StringRef impChkName = saver().save("__impchk_" + name); impchkThunk = make(this); diff --git a/lld/COFF/InputFiles.h b/lld/COFF/InputFiles.h index 0812e9c461045..acf221d85ae8f 100644 --- a/lld/COFF/InputFiles.h +++ b/lld/COFF/InputFiles.h @@ -365,6 +365,7 @@ class ImportFile : public InputFile { // Auxiliary IAT symbol and chunk on ARM64EC. DefinedImportData *impECSym = nullptr; Chunk *auxLocation = nullptr; + Symbol *auxThunkSym = nullptr; // We want to eliminate dllimported symbols if no one actually refers to them. // These "Live" bits are used to keep track of which import library members diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index 0b3c4163020f4..216db652c10aa 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -1252,14 +1252,22 @@ void Writer::appendImportThunks() { if (!file->live) continue; - if (!file->thunkSym) - continue; + if (file->thunkSym) { + if (!isa(file->thunkSym)) + fatal(toString(ctx, *file->thunkSym) + " was replaced"); + auto *chunk = cast(file->thunkSym)->getChunk(); + if (chunk->live) + textSec->addChunk(chunk); + } + + if (file->auxThunkSym) { + if (!isa(file->auxThunkSym)) + fatal(toString(ctx, *file->auxThunkSym) + " was replaced"); + auto *chunk = cast(file->auxThunkSym)->getChunk(); + if (chunk->live) + textSec->addChunk(chunk); + } - if (!isa(file->thunkSym)) - fatal(toString(ctx, *file->thunkSym) + " was replaced"); - DefinedImportThunk *thunk = cast(file->thunkSym); - if (thunk->getChunk()->live) - textSec->addChunk(thunk->getChunk()); if (file->impchkThunk) textSec->addChunk(file->impchkThunk); } diff --git a/lld/test/COFF/arm64ec-import.test b/lld/test/COFF/arm64ec-import.test index f8279cefc3bcf..e403daa41f368 100644 --- a/lld/test/COFF/arm64ec-import.test +++ b/lld/test/COFF/arm64ec-import.test @@ -39,25 +39,31 @@ RUN: llvm-objdump -d out2.dll | FileCheck --check-prefix=DISASM %s DISASM: 180001000: 52800000 mov w0, #0x0 // =0 DISASM-NEXT: 180001004: d65f03c0 ret -DISASM-NEXT: 180001008: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 18000100c: f940056b ldr x11, [x11, #0x8] -DISASM-NEXT: 180001010: 9000000a adrp x10, 0x180001000 <.text> -DISASM-NEXT: 180001014: 9101114a add x10, x10, #0x44 -DISASM-NEXT: 180001018: 17fffffa b 0x180001000 <.text> -DISASM-NEXT: 18000101c: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 180001020: f940096b ldr x11, [x11, #0x10] -DISASM-NEXT: 180001024: f0ffffea adrp x10, 0x180000000 -DISASM-NEXT: 180001028: 9100014a add x10, x10, #0x0 -DISASM-NEXT: 18000102c: 17fffff5 b 0x180001000 <.text> -DISASM-NEXT: 180001030: d000000b adrp x11, 0x180003000 -DISASM-NEXT: 180001034: f940116b ldr x11, [x11, #0x20] -DISASM-NEXT: 180001038: 9000000a adrp x10, 0x180001000 <.text> -DISASM-NEXT: 18000103c: 9101314a add x10, x10, #0x4c 
-DISASM-NEXT: 180001040: 17fffff0 b 0x180001000 <.text> -DISASM-NEXT: 180001044: 52800020 mov w0, #0x1 // =1 -DISASM-NEXT: 180001048: d65f03c0 ret -DISASM-NEXT: 18000104c: 52800040 mov w0, #0x2 // =2 -DISASM-NEXT: 180001050: d65f03c0 ret +DISASM-NEXT: 180001008: 90000030 adrp x16, 0x180005000 +DISASM-NEXT: 18000100c: f9400610 ldr x16, [x16, #0x8] +DISASM-NEXT: 180001010: d61f0200 br x16 +DISASM-NEXT: 180001014: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 180001018: f940056b ldr x11, [x11, #0x8] +DISASM-NEXT: 18000101c: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001020: 9101714a add x10, x10, #0x5c +DISASM-NEXT: 180001024: 17fffff7 b 0x180001000 <.text> +DISASM-NEXT: 180001028: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000102c: f940096b ldr x11, [x11, #0x10] +DISASM-NEXT: 180001030: f0ffffea adrp x10, 0x180000000 +DISASM-NEXT: 180001034: 9100014a add x10, x10, #0x0 +DISASM-NEXT: 180001038: 17fffff2 b 0x180001000 <.text> +DISASM-NEXT: 18000103c: 90000030 adrp x16, 0x180005000 +DISASM-NEXT: 180001040: f9401210 ldr x16, [x16, #0x20] +DISASM-NEXT: 180001044: d61f0200 br x16 +DISASM-NEXT: 180001048: d000000b adrp x11, 0x180003000 +DISASM-NEXT: 18000104c: f940116b ldr x11, [x11, #0x20] +DISASM-NEXT: 180001050: 9000000a adrp x10, 0x180001000 <.text> +DISASM-NEXT: 180001054: 9101914a add x10, x10, #0x64 +DISASM-NEXT: 180001058: 17ffffea b 0x180001000 <.text> +DISASM-NEXT: 18000105c: 52800020 mov w0, #0x1 // =1 +DISASM-NEXT: 180001060: d65f03c0 ret +DISASM-NEXT: 180001064: 52800040 mov w0, #0x2 // =2 +DISASM-NEXT: 180001068: d65f03c0 ret DISASM-NEXT: ... DISASM-NEXT: 180002000: ff 25 02 10 00 00 jmpq *0x1002(%rip) # 0x180003008 @@ -65,7 +71,8 @@ RUN: llvm-readobj --hex-dump=.test out.dll | FileCheck --check-prefix=TESTSEC %s RUN: llvm-readobj --hex-dump=.test out2.dll | FileCheck --check-prefix=TESTSEC %s TESTSEC: 0x180007000 08500000 00300000 10500000 20500000 TESTSEC-NEXT: 0x180007010 08300000 00500000 10300000 20300000 -TESTSEC-NEXT: 0x180007020 08100000 1c100000 00200000 +TESTSEC-NEXT: 0x180007020 14100000 28100000 00200000 08100000 +TESTSEC-NEXT: 0x180007030 3c100000 RUN: llvm-readobj --headers out.dll | FileCheck -check-prefix=HEADERS %s HEADERS: LoadConfigTableRVA: 0x4010 @@ -76,9 +83,9 @@ RUN: llvm-readobj --coff-load-config out.dll | FileCheck -check-prefix=LOADCONFI LOADCONFIG: AuxiliaryIAT: 0x5000 RUN: llvm-readobj --hex-dump=.rdata out.dll | FileCheck -check-prefix=RDATA %s -RDATA: 0x180005000 00000000 00000000 08100080 01000000 -RDATA-NEXT: 0x180005010 1c100080 01000000 00000000 00000000 -RDATA-NEXT: 0x180005020 30100080 01000000 00000000 00000000 +RDATA: 0x180005000 00000000 00000000 14100080 01000000 +RDATA-NEXT: 0x180005010 28100080 01000000 00000000 00000000 +RDATA-NEXT: 0x180005020 48100080 01000000 00000000 00000000 RUN: llvm-readobj --coff-basereloc out.dll | FileCheck -check-prefix=BASERELOC %s BASERELOC: BaseReloc [ @@ -110,6 +117,8 @@ arm64ec_data_sym: .rva __impchk_func .rva __impchk_func2 .rva func + .rva "#func" + .rva "#t2func" #--- icall.s .text From a6438360d416f4529574eebf6aa65b80d48ef85e Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Fri, 13 Sep 2024 11:10:10 -0400 Subject: [PATCH 18/43] [libc] fix build issue in overlay mode (#108583) --- libc/src/__support/OSUtil/linux/vdso_sym.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/OSUtil/linux/vdso_sym.h b/libc/src/__support/OSUtil/linux/vdso_sym.h index eb5f204a82f30..968e1536c4d27 100644 --- a/libc/src/__support/OSUtil/linux/vdso_sym.h +++ 
b/libc/src/__support/OSUtil/linux/vdso_sym.h @@ -44,8 +44,8 @@ template LIBC_INLINE constexpr auto dispatcher() { else if constexpr (sym == VDSOSym::ClockGetTime64) return static_cast(nullptr); else if constexpr (sym == VDSOSym::GetTimeOfDay) - return static_cast( - nullptr); + return static_cast(nullptr); else if constexpr (sym == VDSOSym::GetCpu) return static_cast( nullptr); From ff1de24a16c8f4dddc4381df00fe15e42891508b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 13 Sep 2024 15:37:44 +0000 Subject: [PATCH 19/43] [llvm-exegesis] Remove getter for RegNameToRegNo mapping This patch removes the getter for the mentioned mapping. This was only kept around to keep things in sync for some downstream codebases (that didn't even end up needing it), so removing it now that it is not needed anymore. --- llvm/tools/llvm-exegesis/lib/LlvmState.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/LlvmState.h b/llvm/tools/llvm-exegesis/lib/LlvmState.h index e42393edb636d..f69d76c9a1e4e 100644 --- a/llvm/tools/llvm-exegesis/lib/LlvmState.h +++ b/llvm/tools/llvm-exegesis/lib/LlvmState.h @@ -76,14 +76,6 @@ class LLVMState { return *OpcodeNameToOpcodeIdxMapping; }; - // TODO(boomanaiden154): We are keeping this getter around to enable internal - // migration to getRegisterNumberFromName. Once that is complete and - // the changes have been pulled, we can remove this. - const DenseMap &getRegNameToRegNoMapping() const { - assert(RegNameToRegNoMapping); - return *RegNameToRegNoMapping; - } - std::optional getRegisterNumberFromName(StringRef RegisterName) const; From ffeb793f3a90623ab3c7f33f922d48a1f5f97cdf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Fri, 13 Sep 2024 17:51:28 +0200 Subject: [PATCH 20/43] [clang][analyzer][docs] Fix documentation of checker 'StackAddrAsyncEscape' (NFC) (#108586) The checker was indicated as a 'C' language checker but is only applicable to 'ObjC' code. --- clang/docs/analyzer/checkers.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 847bf4baf7488..c124fefc78611 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -2571,8 +2571,8 @@ with the `offsetof` macro. .. _alpha-core-StackAddressAsyncEscape: -alpha.core.StackAddressAsyncEscape (C) -"""""""""""""""""""""""""""""""""""""" +alpha.core.StackAddressAsyncEscape (ObjC) +""""""""""""""""""""""""""""""""""""""""" Check that addresses to stack memory do not escape the function that involves dispatch_after or dispatch_async. This checker is a part of ``core.StackAddressEscape``, but is temporarily disabled until some false positives are fixed. From 213c59ddd2a702ddd3d849cea250440b1ed718e0 Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Fri, 13 Sep 2024 09:02:31 -0700 Subject: [PATCH 21/43] [lldb] Add pc check for thread-step-by-bp algorithms (#108504) lldb-server built with NativeProcessLinux.cpp and NativeProcessFreeBSD.cpp can use breakpoints to implement instruction stepping on cores where there is no native instruction-step primitive. Currently these set a breakpoint, continue, and if we hit the breakpoint with the original thread, set the stop reason to be "trace". 
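Concretely, the rewrite in question looks roughly like this today (a sketch of
the pre-patch logic; the exact code is in MonitorBreakpoint/MonitorSIGTRAP in
the diffs below):

  // Pre-patch: any stop by the stepping thread is reported as "trace",
  // even if it stopped at the not-yet-executed breakpoint it started on.
  if (m_threads_stepping_with_breakpoint.find(thread.GetID()) !=
      m_threads_stepping_with_breakpoint.end())
    thread.SetStoppedByTrace();
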
I am wrapping up a change to lldb's breakpoint algorithm where I change its
current behavior of "if a thread stops at a breakpoint site, we set the
thread's stop reason to breakpoint-hit, even if the breakpoint hasn't been
executed" + "when resuming any thread at a breakpoint site, instruction-step
past the breakpoint before resuming" to a behavior of "when a thread executes
a breakpoint, set the stop reason to breakpoint-hit" + "when a thread has hit
a breakpoint, when the thread resumes, we silently step past the breakpoint
and then resume the thread".

For these lldb-server targets doing breakpoint stepping, this means that if we
are sitting on a breakpoint that has not yet executed, and instruction-step
the thread, we will execute the breakpoint instruction at $pc (instead of at
$next-pc, where it was meant to go), and stop again -- at the same pc value.
Then we will rewrite the stop reason to 'trace'. The higher-level logic will
see that we haven't hit the breakpoint instruction again, so it will try to
instruction-step again, hitting the breakpoint again forever.

To fix this, I'm checking that the thread matches the one we are
instruction-stepping by breakpoint AND that we've stopped at the breakpoint
address we are stepping to. Only in that case will the stop reason be
rewritten to "trace", hiding the implementation detail that the step was done
by breakpoints.
---
 .../Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp     | 5 ++++-
 lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp | 7 +++++--
 2 files changed, 9 insertions(+), 3 deletions(-)

diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
index 97fff4b9f65a8..80b27571f43d5 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeProcessFreeBSD.cpp
@@ -319,9 +319,12 @@ void NativeProcessFreeBSD::MonitorSIGTRAP(lldb::pid_t pid) {
                                 info.pl_siginfo.si_addr);
 
       if (thread) {
+        auto &regctx = static_cast<NativeRegisterContextFreeBSD &>(
+            thread->GetRegisterContext());
         auto thread_info =
             m_threads_stepping_with_breakpoint.find(thread->GetID());
-        if (thread_info != m_threads_stepping_with_breakpoint.end()) {
+        if (thread_info != m_threads_stepping_with_breakpoint.end() &&
+            thread_info->second == regctx.GetPC()) {
           thread->SetStoppedByTrace();
           Status brkpt_error = RemoveBreakpoint(thread_info->second);
           if (brkpt_error.Fail())
diff --git a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
index 5c262db8db7fd..38b7092682873 100644
--- a/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
+++ b/lldb/source/Plugins/Process/Linux/NativeProcessLinux.cpp
@@ -829,8 +829,11 @@ void NativeProcessLinux::MonitorBreakpoint(NativeThreadLinux &thread) {
   thread.SetStoppedByBreakpoint();
   FixupBreakpointPCAsNeeded(thread);
 
-  if (m_threads_stepping_with_breakpoint.find(thread.GetID()) !=
-      m_threads_stepping_with_breakpoint.end())
+  NativeRegisterContextLinux &reg_ctx = thread.GetRegisterContext();
+  auto stepping_with_bp_it =
+      m_threads_stepping_with_breakpoint.find(thread.GetID());
+  if (stepping_with_bp_it != m_threads_stepping_with_breakpoint.end() &&
+      stepping_with_bp_it->second == reg_ctx.GetPC())
     thread.SetStoppedByTrace();
 
   StopRunningThreads(thread.GetID());

From 65a4d11b1e67429d53df1fcee0f93492aa95c448 Mon Sep 17 00:00:00 2001
From: Jason Molenda
Date: Fri, 13 Sep 2024 09:04:28 -0700
Subject: [PATCH 22/43] [lldb] Set the stop reason when receiving
 swbreak/hwbreak (#108518)
xusheng added support for swbreak/hwbreak a month ago, and no special support
was needed in ProcessGDBRemote when they're received, because lldb already
marks a thread as having hit a breakpoint when it stops at a breakpoint site.
However, with changes I am working on, we need to know the real reason a
thread stopped, or the breakpoint hit will not be recognized. This is similar
to how lldb processes the "watch/rwatch/awatch" keys in a thread stop
packet -- those set the `reason` to `watchpoint`, and these set it to
`breakpoint`, so we set the stop reason correctly later in these methods.
---
 lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
index 5eaf9ce2a302a..271ff61a7188a 100644
--- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
+++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp
@@ -2317,6 +2317,8 @@ StateType ProcessGDBRemote::SetThreadStopInfo(StringExtractor &stop_packet) {
           StreamString ostr;
           ostr.Printf("%" PRIu64, wp_addr);
           description = std::string(ostr.GetString());
+        } else if (key.compare("swbreak") == 0 || key.compare("hwbreak") == 0) {
+          reason = "breakpoint";
         } else if (key.compare("library") == 0) {
           auto error = LoadModules();
           if (error) {

From 51f552568901b069f04edd9eacac89be452bfadf Mon Sep 17 00:00:00 2001
From: Hristo Hristov
Date: Fri, 13 Sep 2024 19:05:20 +0300
Subject: [PATCH 23/43] [libc++][NFC] Mark P1869R1 as implemented (#107746)

https://wg21.link/p1869r1: Rename `condition_variable_any` interruptible
wait methods

The paper was implemented as an experimental feature in Clang 18 in:
https://github.com/llvm/llvm-project/commit/4fa812bb52a5b1eea22750a1b59f94221d0df622
Experimental status removed in:
https://github.com/llvm/llvm-project/pull/107900

Closes https://github.com/llvm/llvm-project/issues/100031

---------

Co-authored-by: Hristo Hristov
---
 libcxx/docs/Status/Cxx20Papers.csv | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv
index b3c26933a9c2a..d449c9d39c3b3 100644
--- a/libcxx/docs/Status/Cxx20Papers.csv
+++ b/libcxx/docs/Status/Cxx20Papers.csv
@@ -152,7 +152,7 @@
 "`P1855R0 <https://wg21.link/P1855R0>`__","Make ``<compare>``\ freestanding","2019-11 (Belfast)","","",""
 "`P1862R1 <https://wg21.link/P1862R1>`__","Ranges adaptors for non-copyable iterators","2019-11 (Belfast)","|Complete|","16.0",""
 "`P1865R1 <https://wg21.link/P1865R1>`__","Add max() to latch and barrier","2019-11 (Belfast)","|Complete|","11.0",""
-"`P1869R1 <https://wg21.link/P1869R1>`__","Rename 'condition_variable_any' interruptible wait methods","2019-11 (Belfast)","","",""
+"`P1869R1 <https://wg21.link/P1869R1>`__","Rename 'condition_variable_any' interruptible wait methods","2019-11 (Belfast)","|Complete|","18.0",""
 "`P1870R1 <https://wg21.link/P1870R1>`__","forwarding-range<T> is too subtle","2019-11 (Belfast)","|Complete|","15.0",""
 "`P1871R1 <https://wg21.link/P1871R1>`__","Concept traits should be named after concepts","2019-11 (Belfast)","|Complete|","14.0",""
 "`P1872R0 <https://wg21.link/P1872R0>`__","span should have size_type, not index_type","2019-11 (Belfast)","|Complete|","10.0",""

From 82987bd9da1fd22a9d05148e8f74142aaf7ec4d0 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Fri, 13 Sep 2024 12:13:27 -0400
Subject: [PATCH 24/43] [libc] fix dependency path for vDSO (#108591)

---
 libc/src/__support/time/linux/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/src/__support/time/linux/CMakeLists.txt b/libc/src/__support/time/linux/CMakeLists.txt
index 4297a02986668..f038cb8854b9b 100644
--- a/libc/src/__support/time/linux/CMakeLists.txt
+++ b/libc/src/__support/time/linux/CMakeLists.txt
@@ -9,7 +9,7 @@ add_header_library(
     libc.src.__support.common
     libc.src.__support.error_or
     libc.src.__support.OSUtil.osutil
-    libc.src.__support.OSUtil.vdso
+    libc.src.__support.OSUtil.linux.vdso
 )
 
 add_header_library(

From cd6844c45cbe9a5c4cf055142c4026dc408a8243 Mon Sep 17 00:00:00 2001
From: Lei Huang
Date: Fri, 13 Sep 2024 12:15:11 -0400
Subject: [PATCH 25/43] [PowerPC][NFC] autogen mma tc checks via
 update_cc_test_checks (#108584)

Update mma tests in prep for changes needed in a follow-up patch for
https://github.com/llvm/llvm-project/issues/107229.

Checks for ``clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c``
appear to have been manually updated to rename temp variables even though
the file says the checks were auto-generated. Regenerate them via the
script.

Add noopt checks for
``clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c``.
---
 .../PowerPC/builtins-ppc-build-pair-mma.c     |  68 ++++++
 .../PowerPC/builtins-ppc-pair-mma-types.c     | 204 +++++++++---------
 2 files changed, 170 insertions(+), 102 deletions(-)

diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
index 471a31a8c5eac..8a2bc93dd6cd0 100644
--- a/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
+++ b/clang/test/CodeGen/PowerPC/builtins-ppc-build-pair-mma.c
@@ -3,6 +3,8 @@
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE
 // RUN: %clang_cc1 -O3 -triple powerpc64-unknown-unknown -target-cpu pwr10 \
 // RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-BE
+// RUN: %clang_cc1 -O0 -triple powerpc64le-unknown-unknown -target-cpu pwr10 \
+// RUN:   -emit-llvm %s -o - | FileCheck %s --check-prefix=CHECK-LE-NOOPT
 
 // CHECK-LE-LABEL: @test1(
 // CHECK-LE-NEXT:  entry:
@@ -16,6 +18,42 @@
 // CHECK-BE-NEXT:    store <512 x i1> [[TMP0]], ptr [[RESP:%.*]], align 64, !tbaa [[TBAA2:![0-9]+]]
 // CHECK-BE-NEXT:    ret void
 //
+// CHECK-LE-NOOPT-LABEL: @test1(
+// CHECK-LE-NOOPT-NEXT:  entry:
+// CHECK-LE-NOOPT-NEXT:    [[VQP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-LE-NOOPT-NEXT:    [[VPP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-LE-NOOPT-NEXT:    [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-LE-NOOPT-NEXT:    [[VC2_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-LE-NOOPT-NEXT:    [[VC3_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-LE-NOOPT-NEXT:    [[VC4_ADDR:%.*]] = alloca <16 x i8>, align 16
+// CHECK-LE-NOOPT-NEXT:    [[RESP_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-LE-NOOPT-NEXT:    [[VQ:%.*]] = alloca <512 x i1>, align 64
+// CHECK-LE-NOOPT-NEXT:    [[VP:%.*]] = alloca <256 x i1>, align 32
+// CHECK-LE-NOOPT-NEXT:    [[RES:%.*]] = alloca <512 x i1>, align 64
+// CHECK-LE-NOOPT-NEXT:    store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8
+// CHECK-LE-NOOPT-NEXT:    store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8
+// CHECK-LE-NOOPT-NEXT:    store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16
+// CHECK-LE-NOOPT-NEXT:    store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16
+// CHECK-LE-NOOPT-NEXT:    store <16 x i8> [[VC3:%.*]], ptr [[VC3_ADDR]], align 16
+// CHECK-LE-NOOPT-NEXT:    store <16 x i8> [[VC4:%.*]], ptr [[VC4_ADDR]], align 16
+// CHECK-LE-NOOPT-NEXT:    store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8
+// CHECK-LE-NOOPT-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]], align 8
+// CHECK-LE-NOOPT-NEXT:    [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64
+// CHECK-LE-NOOPT-NEXT:    store <512 x i1> [[TMP1]], ptr [[VQ]], align 64
+//
CHECK-LE-NOOPT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP3]], ptr [[VP]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC3_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC4_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP8:%.*]] = call <512 x i1> @llvm.ppc.mma.assemble.acc(<16 x i8> [[TMP7]], <16 x i8> [[TMP6]], <16 x i8> [[TMP5]], <16 x i8> [[TMP4]]) +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP8]], ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP9:%.*]] = load <512 x i1>, ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP10:%.*]] = load ptr, ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP9]], ptr [[TMP10]], align 64 +// CHECK-LE-NOOPT-NEXT: ret void +// void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vector unsigned char vc2, vector unsigned char vc3, vector unsigned char vc4, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); @@ -37,6 +75,36 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vec // CHECK-BE-NEXT: store <256 x i1> [[TMP0]], ptr [[RESP:%.*]], align 32, !tbaa [[TBAA6:![0-9]+]] // CHECK-BE-NEXT: ret void // +// CHECK-LE-NOOPT-LABEL: @test2( +// CHECK-LE-NOOPT-NEXT: entry: +// CHECK-LE-NOOPT-NEXT: [[VQP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VPP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VC1_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[VC2_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-LE-NOOPT-NEXT: [[RESP_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-LE-NOOPT-NEXT: [[VQ:%.*]] = alloca <512 x i1>, align 64 +// CHECK-LE-NOOPT-NEXT: [[VP:%.*]] = alloca <256 x i1>, align 32 +// CHECK-LE-NOOPT-NEXT: [[RES:%.*]] = alloca <256 x i1>, align 32 +// CHECK-LE-NOOPT-NEXT: store ptr [[VQP:%.*]], ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store ptr [[VPP:%.*]], ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC1:%.*]], ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store <16 x i8> [[VC2:%.*]], ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: store ptr [[RESP:%.*]], ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VQP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP1:%.*]] = load <512 x i1>, ptr [[TMP0]], align 64 +// CHECK-LE-NOOPT-NEXT: store <512 x i1> [[TMP1]], ptr [[VQ]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP3]], ptr [[VP]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC1_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC2_ADDR]], align 16 +// CHECK-LE-NOOPT-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP5]], <16 x i8> [[TMP4]]) +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP6]], ptr [[RES]], align 64 +// CHECK-LE-NOOPT-NEXT: [[TMP7:%.*]] = load <256 x i1>, ptr [[RES]], align 32 +// CHECK-LE-NOOPT-NEXT: [[TMP8:%.*]] = load ptr, ptr [[RESP_ADDR]], align 8 +// CHECK-LE-NOOPT-NEXT: store <256 x i1> [[TMP7]], ptr [[TMP8]], align 32 
+// CHECK-LE-NOOPT-NEXT: ret void +// void test2(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc1, vector unsigned char vc2, unsigned char *resp) { __vector_quad vq = *((__vector_quad *)vqp); diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c index a414a2827b2c4..39c040967dc0c 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma-types.c @@ -16,18 +16,18 @@ // CHECK-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[VQP]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <512 x i1>, ptr [[TMP2]], align 64 -// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ1]], align 64 -// CHECK-NEXT: [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() -// CHECK-NEXT: store <512 x i1> [[TMP4]], ptr [[VQ2]], align 64 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 +// CHECK-NEXT: store <512 x i1> [[TMP2]], ptr [[VQ1]], align 64 +// CHECK-NEXT: [[TMP3:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ2]], align 64 +// CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -// CHECK-NEXT: store <512 x i1> [[TMP7]], ptr [[VQ3]], align 64 -// CHECK-NEXT: [[TMP8:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 -// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-NEXT: store <512 x i1> [[TMP8]], ptr [[TMP9]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +// CHECK-NEXT: store <512 x i1> [[TMP6]], ptr [[VQ3]], align 64 +// CHECK-NEXT: [[TMP7:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 +// CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-NEXT: store <512 x i1> [[TMP7]], ptr [[TMP8]], align 64 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testVQLocal( @@ -42,18 +42,18 @@ // CHECK-BE-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-BE-NEXT: store ptr [[TMP0]], ptr [[VQP]], align 8 -// CHECK-BE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-BE-NEXT: [[TMP3:%.*]] = load <512 x i1>, ptr [[TMP2]], align 64 -// CHECK-BE-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ1]], align 64 -// CHECK-BE-NEXT: [[TMP4:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() -// CHECK-BE-NEXT: store <512 x i1> [[TMP4]], ptr [[VQ2]], align 64 +// CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 +// CHECK-BE-NEXT: store <512 x i1> [[TMP2]], ptr [[VQ1]], align 64 +// CHECK-BE-NEXT: [[TMP3:%.*]] = call <512 x i1> @llvm.ppc.mma.xxsetaccz() +// CHECK-BE-NEXT: store <512 x i1> [[TMP3]], ptr [[VQ2]], align 64 +// CHECK-BE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], 
align 16 -// CHECK-BE-NEXT: [[TMP7:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP5]], <16 x i8> [[TMP6]]) -// CHECK-BE-NEXT: store <512 x i1> [[TMP7]], ptr [[VQ3]], align 64 -// CHECK-BE-NEXT: [[TMP8:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 -// CHECK-BE-NEXT: [[TMP9:%.*]] = load ptr, ptr [[VQP]], align 8 -// CHECK-BE-NEXT: store <512 x i1> [[TMP8]], ptr [[TMP9]], align 64 +// CHECK-BE-NEXT: [[TMP6:%.*]] = call <512 x i1> @llvm.ppc.mma.xvi4ger8(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) +// CHECK-BE-NEXT: store <512 x i1> [[TMP6]], ptr [[VQ3]], align 64 +// CHECK-BE-NEXT: [[TMP7:%.*]] = load <512 x i1>, ptr [[VQ3]], align 64 +// CHECK-BE-NEXT: [[TMP8:%.*]] = load ptr, ptr [[VQP]], align 8 +// CHECK-BE-NEXT: store <512 x i1> [[TMP7]], ptr [[TMP8]], align 64 // CHECK-BE-NEXT: ret void // void testVQLocal(int *ptr, vector unsigned char vc) { @@ -79,24 +79,24 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-NEXT: store ptr [[TMP0]], ptr [[VPP]], align 8 -// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 -// CHECK-NEXT: store <256 x i1> [[TMP3]], ptr [[VP1]], align 32 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load <256 x i1>, ptr [[TMP1]], align 32 +// CHECK-NEXT: store <256 x i1> [[TMP2]], ptr [[VP1]], align 32 +// CHECK-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) -// CHECK-NEXT: store <256 x i1> [[TMP6]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP8]], <16 x i8> [[TMP7]]) -// CHECK-NEXT: store <256 x i1> [[TMP9]], ptr [[VP2]], align 64 -// CHECK-NEXT: [[TMP10:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-NEXT: [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]]) -// CHECK-NEXT: store <512 x i1> [[TMP12]], ptr [[VQ]], align 64 -// CHECK-NEXT: [[TMP13:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-NEXT: [[TMP14:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-NEXT: store <256 x i1> [[TMP13]], ptr [[TMP14]], align 32 +// CHECK-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP6]]) +// CHECK-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 +// CHECK-NEXT: [[TMP11:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) +// CHECK-NEXT: store <512 x i1> [[TMP11]], ptr [[VQ]], align 64 +// CHECK-NEXT: 
[[TMP12:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-NEXT: [[TMP13:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-NEXT: store <256 x i1> [[TMP12]], ptr [[TMP13]], align 32 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testVPLocal( @@ -112,24 +112,24 @@ void testVQLocal(int *ptr, vector unsigned char vc) { // CHECK-BE-NEXT: store <16 x i8> [[VC:%.*]], ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP0:%.*]] = load ptr, ptr [[PTR_ADDR]], align 8 // CHECK-BE-NEXT: store ptr [[TMP0]], ptr [[VPP]], align 8 -// CHECK-BE-NEXT: [[TMP2:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-BE-NEXT: [[TMP3:%.*]] = load <256 x i1>, ptr [[TMP2]], align 32 -// CHECK-BE-NEXT: store <256 x i1> [[TMP3]], ptr [[VP1]], align 32 +// CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-BE-NEXT: [[TMP2:%.*]] = load <256 x i1>, ptr [[TMP1]], align 32 +// CHECK-BE-NEXT: store <256 x i1> [[TMP2]], ptr [[VP1]], align 32 +// CHECK-BE-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP6:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP4]], <16 x i8> [[TMP5]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP6]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP5:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP3]], <16 x i8> [[TMP4]]) +// CHECK-BE-NEXT: store <256 x i1> [[TMP5]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 // CHECK-BE-NEXT: [[TMP7:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP8:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP9:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP7]], <16 x i8> [[TMP8]]) -// CHECK-BE-NEXT: store <256 x i1> [[TMP9]], ptr [[VP2]], align 64 -// CHECK-BE-NEXT: [[TMP10:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-BE-NEXT: [[TMP11:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 -// CHECK-BE-NEXT: [[TMP12:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP10]], <16 x i8> [[TMP11]]) -// CHECK-BE-NEXT: store <512 x i1> [[TMP12]], ptr [[VQ]], align 64 -// CHECK-BE-NEXT: [[TMP13:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 -// CHECK-BE-NEXT: [[TMP14:%.*]] = load ptr, ptr [[VPP]], align 8 -// CHECK-BE-NEXT: store <256 x i1> [[TMP13]], ptr [[TMP14]], align 32 +// CHECK-BE-NEXT: [[TMP8:%.*]] = call <256 x i1> @llvm.ppc.vsx.assemble.pair(<16 x i8> [[TMP6]], <16 x i8> [[TMP7]]) +// CHECK-BE-NEXT: store <256 x i1> [[TMP8]], ptr [[VP2]], align 64 +// CHECK-BE-NEXT: [[TMP9:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-BE-NEXT: [[TMP10:%.*]] = load <16 x i8>, ptr [[VC_ADDR]], align 16 +// CHECK-BE-NEXT: [[TMP11:%.*]] = call <512 x i1> @llvm.ppc.mma.xvf64ger(<256 x i1> [[TMP9]], <16 x i8> [[TMP10]]) +// CHECK-BE-NEXT: store <512 x i1> [[TMP11]], ptr [[VQ]], align 64 +// CHECK-BE-NEXT: [[TMP12:%.*]] = load <256 x i1>, ptr [[VP3]], align 32 +// CHECK-BE-NEXT: [[TMP13:%.*]] = load ptr, ptr [[VPP]], align 8 +// CHECK-BE-NEXT: store <256 x i1> [[TMP12]], ptr [[TMP13]], align 32 // CHECK-BE-NEXT: ret void // void testVPLocal(int *ptr, vector unsigned char vc) { @@ -154,18 +154,18 @@ void testVPLocal(int *ptr, vector unsigned char vc) { // CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ACC_ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-NEXT: [[TMP3:%.*]] = 
call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-NEXT: ret void // // CHECK-BE-LABEL: @testRestrictQualifiedPointer2( @@ -178,18 +178,18 @@ void testVPLocal(int *ptr, vector unsigned char vc) { // CHECK-BE-NEXT: [[TMP1:%.*]] = load ptr, ptr [[ACC_ADDR]], align 8 // CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-BE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-BE-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-BE-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-BE-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-BE-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-BE-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-BE-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-BE-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-BE-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-BE-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-BE-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-BE-NEXT: 
[[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-BE-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-BE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-BE-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-BE-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-BE-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-BE-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-BE-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-BE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-BE-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-BE-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-BE-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 +// CHECK-BE-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 +// CHECK-BE-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16 // CHECK-BE-NEXT: ret void // void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) { @@ -207,18 +207,18 @@ void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) { // CHECK-NEXT: [[TMP1:%.*]] = load volatile ptr, ptr [[ACC_ADDR]], align 8 // CHECK-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64 // CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]]) -// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 -// CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 -// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 -// CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 -// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 -// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 -// CHECK-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16 -// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3 -// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3 -// CHECK-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16 +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0 +// CHECK-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16 +// CHECK-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1 +// CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16 +// CHECK-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2 +// CHECK-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16 +// CHECK-NEXT: [[TMP10:%.*]] = extractvalue { 
<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3
+// CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16
 // CHECK-NEXT: ret void
 //
 // CHECK-BE-LABEL: @testVolatileQualifiedPointer2(
@@ -231,18 +231,18 @@ void testRestrictQualifiedPointer2(__vector_quad *__restrict acc) {
 // CHECK-BE-NEXT: [[TMP1:%.*]] = load volatile ptr, ptr [[ACC_ADDR]], align 8
 // CHECK-BE-NEXT: [[TMP2:%.*]] = load <512 x i1>, ptr [[TMP1]], align 64
 // CHECK-BE-NEXT: [[TMP3:%.*]] = call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.ppc.mma.disassemble.acc(<512 x i1> [[TMP2]])
-// CHECK-BE-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
-// CHECK-BE-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0
-// CHECK-BE-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16
-// CHECK-BE-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
-// CHECK-BE-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1
-// CHECK-BE-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16
-// CHECK-BE-NEXT: [[TMP9:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
-// CHECK-BE-NEXT: [[TMP10:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2
-// CHECK-BE-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP10]], align 16
-// CHECK-BE-NEXT: [[TMP11:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
-// CHECK-BE-NEXT: [[TMP12:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3
-// CHECK-BE-NEXT: store <16 x i8> [[TMP11]], ptr [[TMP12]], align 16
+// CHECK-BE-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 0
+// CHECK-BE-NEXT: [[TMP5:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 0
+// CHECK-BE-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP5]], align 16
+// CHECK-BE-NEXT: [[TMP6:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 1
+// CHECK-BE-NEXT: [[TMP7:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 1
+// CHECK-BE-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP7]], align 16
+// CHECK-BE-NEXT: [[TMP8:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 2
+// CHECK-BE-NEXT: [[TMP9:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 2
+// CHECK-BE-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP9]], align 16
+// CHECK-BE-NEXT: [[TMP10:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP3]], 3
+// CHECK-BE-NEXT: [[TMP11:%.*]] = getelementptr inbounds <16 x i8>, ptr [[ARRAYDECAY]], i32 3
+// CHECK-BE-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP11]], align 16
 // CHECK-BE-NEXT: ret void
 //
 void testVolatileQualifiedPointer2(__vector_quad *__volatile acc) {

From 661382f2c07ba464caa0ad0fb8c64c1c3b20e9a4 Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Fri, 13 Sep 2024 09:17:06 -0700
Subject: [PATCH 26/43] [LLDB][Minidump] Minidump erase file on failure (#108259)

In #95312, minidump file creation was changed from writing the whole file at
the end to emitting it in chunks. This had the undesirable side effect that a
partially written file could be left behind after an error. To resolve this,
we now delete the file when an error occurs.
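The fix is implemented as a small RAII guard: a stack object whose destructor
deletes the partially written file unless the save path explicitly marks
success first. Below is a minimal, self-contained sketch of the pattern; the
types and names are illustrative stand-ins, not the actual LLDB classes:

  #include <cstdio>
  #include <string>

  // Illustrative stand-in for the builder; only the piece the guard needs.
  struct FileBuilder {
    std::string Path;
    void DeleteFile() { std::remove(Path.c_str()); }
  };

  // Removes the output file on destruction unless SetSuccess() was called,
  // mirroring the DumpFailRemoveHolder added in the diff below.
  class DumpFailRemoveGuard {
  public:
    explicit DumpFailRemoveGuard(FileBuilder &B) : Builder(B) {}
    ~DumpFailRemoveGuard() {
      if (!Succeeded)
        Builder.DeleteFile();
    }
    void SetSuccess() { Succeeded = true; }

  private:
    FileBuilder &Builder;
    bool Succeeded = false;
  };

  bool SaveCore(FileBuilder &Builder, bool EverythingWorked) {
    DumpFailRemoveGuard Guard(Builder);
    if (!EverythingWorked)
      return false; // early error return: the guard deletes the file
    Guard.SetSuccess(); // reached only on the success path
    return true;
  }

  int main() {
    FileBuilder B{"core.partial.dmp"};
    return SaveCore(B, /*EverythingWorked=*/false) ? 0 : 1;
  }

Every early return that skips SetSuccess() gets cleanup for free, so no
per-error deletion code is needed at each failure site.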
---
 .../Minidump/MinidumpFileBuilder.cpp | 12 ++++++++
 .../ObjectFile/Minidump/MinidumpFileBuilder.h | 3 ++
 .../Minidump/ObjectFileMinidump.cpp | 18 ++++++++++++
 .../TestProcessSaveCoreMinidump.py | 29 +++++++++++++++++++
 4 files changed, 62 insertions(+)

diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
index edc568a6b47e0..ca22dacb2ba6c 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp
@@ -1218,3 +1218,15 @@ Status MinidumpFileBuilder::DumpFile() {

   return error;
 }
+
+void MinidumpFileBuilder::DeleteFile() noexcept {
+  Log *log = GetLog(LLDBLog::Object);
+
+  if (m_core_file) {
+    Status error = m_core_file->Close();
+    if (error.Fail())
+      LLDB_LOGF(log, "Failed to close minidump file: %s", error.AsCString());
+
+    m_core_file.reset();
+  }
+}
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index 71001e26c00e9..72e5658718b3c 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -115,6 +115,9 @@ class MinidumpFileBuilder {
   // Run cleanup and write all remaining bytes to file
   lldb_private::Status DumpFile();

+  // Delete the file if it exists
+  void DeleteFile() noexcept;
+
 private:
   // Add data to the end of the buffer, if the buffer exceeds the flush level,
   // trigger a flush.
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
index 5da69dd4f2ce7..be47991bb09fc 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
+++ b/lldb/source/Plugins/ObjectFile/Minidump/ObjectFileMinidump.cpp
@@ -55,6 +55,21 @@ size_t ObjectFileMinidump::GetModuleSpecifications(
   return 0;
 }

+struct DumpFailRemoveHolder {
+  DumpFailRemoveHolder(MinidumpFileBuilder &builder) : m_builder(builder) {}
+
+  ~DumpFailRemoveHolder() {
+    if (!m_success)
+      m_builder.DeleteFile();
+  }
+
+  void SetSuccess() { m_success = true; }
+
+private:
+  MinidumpFileBuilder &m_builder;
+  bool m_success = false;
+};
+
 bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp,
                                   lldb_private::SaveCoreOptions &options,
                                   lldb_private::Status &error) {
@@ -75,6 +90,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp,
   }
   MinidumpFileBuilder builder(std::move(maybe_core_file.get()), process_sp,
                               options);
+  DumpFailRemoveHolder request(builder);

   Log *log = GetLog(LLDBLog::Object);
   error = builder.AddHeaderAndCalculateDirectories();
@@ -133,5 +149,7 @@ bool ObjectFileMinidump::SaveCore(const lldb::ProcessSP &process_sp,
     return false;
   }

+  request.SetSuccess();
+
   return true;
 }
diff --git a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
index 2cbe20ee10b1a..ccdb6653cf16f 100644
--- a/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
+++ b/lldb/test/API/functionalities/process_save_core_minidump/TestProcessSaveCoreMinidump.py
@@ -493,3 +493,32 @@ def test_save_minidump_custom_save_style_duplicated_regions(self):

         finally:
             self.assertTrue(self.dbg.DeleteTarget(target))
+
+    @skipUnlessPlatform(["linux"])
+    def test_minidump_deleted_on_save_failure(self):
+        """Test that verifies the minidump file is
deleted after an error"""
+
+        self.build()
+        exe = self.getBuildArtifact("a.out")
+        try:
+            target = self.dbg.CreateTarget(exe)
+            process = target.LaunchSimple(
+                None, None, self.get_process_working_directory()
+            )
+            self.assertState(process.GetState(), lldb.eStateStopped)
+
+            custom_file = self.getBuildArtifact("core.should.be.deleted.custom.dmp")
+            options = lldb.SBSaveCoreOptions()
+            options.SetOutputFile(lldb.SBFileSpec(custom_file))
+            options.SetPluginName("minidump")
+            options.SetStyle(lldb.eSaveCoreCustomOnly)
+            # We request custom-only style with no thread list and no memory, so the save must fail.
+            error = process.SaveCore(options)
+            self.assertTrue(error.Fail())
+            self.assertIn(
+                "no valid address ranges found for core style", error.GetCString()
+            )
+            self.assertTrue(not os.path.isfile(custom_file))
+
+        finally:
+            self.assertTrue(self.dbg.DeleteTarget(target))

From f0f1b706e2333ecbe3027a3da5ae7b1ff5c1cfc4 Mon Sep 17 00:00:00 2001
From: vporpo
Date: Fri, 13 Sep 2024 09:28:49 -0700
Subject: [PATCH 27/43] [SandboxIR][PassRegistry] Parse pipeline string (#108103)

This patch implements a simple version of the pipeline parsing function. It
currently only handles a single function pass manager (FPM) and adds function
passes to it.

---
 llvm/include/llvm/SandboxIR/PassManager.h | 4 +++
 llvm/lib/SandboxIR/PassManager.cpp | 32 +++++++++++++++++++++++
 llvm/unittests/SandboxIR/PassTest.cpp | 31 ++++++++++++++++++++
 3 files changed, 67 insertions(+)

diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 5e250641f3b3f..2cc669a966e0b 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -72,6 +72,7 @@ class PassRegistry {
   DenseMap<StringRef, Pass *> NameToPassMap;

 public:
+  static constexpr const char PassDelimToken = ',';
   PassRegistry() = default;
   /// Registers \p PassPtr and takes ownership.
   Pass &registerPass(std::unique_ptr<Pass> &&PassPtr) {
@@ -85,6 +86,9 @@
     auto It = NameToPassMap.find(Name);
     return It != NameToPassMap.end() ? It->second : nullptr;
   }
+  /// Creates a pass pipeline and returns the first pass manager.
+  FunctionPassManager &parseAndCreatePassPipeline(StringRef Pipeline);
+
 #ifndef NDEBUG
   void print(raw_ostream &OS) const {
     for (const auto &PassPtr : Passes)
diff --git a/llvm/lib/SandboxIR/PassManager.cpp b/llvm/lib/SandboxIR/PassManager.cpp
index 2dd19e74734db..4abd39b28e87a 100644
--- a/llvm/lib/SandboxIR/PassManager.cpp
+++ b/llvm/lib/SandboxIR/PassManager.cpp
@@ -20,6 +20,38 @@ bool FunctionPassManager::runOnFunction(Function &F) {
   // TODO: Check ChangeAll against hashes before/after.
   return Change;
 }
+
+FunctionPassManager &
+PassRegistry::parseAndCreatePassPipeline(StringRef Pipeline) {
+  static constexpr const char EndToken = '\0';
+  // Add EndToken to the end to ease parsing.
+  std::string PipelineStr = std::string(Pipeline) + EndToken;
+  int FlagBeginIdx = 0;
+  // Start with a FunctionPassManager.
+  auto &InitialPM = static_cast<FunctionPassManager &>(
+      registerPass(std::make_unique<FunctionPassManager>("init-fpm")));
+
+  for (auto [Idx, C] : enumerate(PipelineStr)) {
+    // Keep moving Idx until we find the end of the pass name.
+    bool FoundDelim = C == EndToken || C == PassDelimToken;
+    if (!FoundDelim)
+      continue;
+    unsigned Sz = Idx - FlagBeginIdx;
+    std::string PassName(&PipelineStr[FlagBeginIdx], Sz);
+    FlagBeginIdx = Idx + 1;
+
+    // Get the pass that corresponds to PassName and add it to the pass manager.
+    auto *Pass = getPassByName(PassName);
+    if (Pass == nullptr) {
+      errs() << "Pass '" << PassName << "' not registered!\n";
+      exit(1);
+    }
+    // TODO: This is safe for now, but would require proper upcasting once we
+    // add more Pass sub-classes.
+    InitialPM.addPass(static_cast<FunctionPass *>(Pass));
+  }
+  return InitialPM;
+}
 #ifndef NDEBUG
 void PassRegistry::dump() const {
   print(dbgs());
diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp
index 3517f0e32b1bb..ed226d5765586 100644
--- a/llvm/unittests/SandboxIR/PassTest.cpp
+++ b/llvm/unittests/SandboxIR/PassTest.cpp
@@ -162,3 +162,34 @@ TEST_F(PassTest, PassRegistry) {
   EXPECT_EQ(Buff, "test-pass1\ntest-pass2\n");
 #endif // NDEBUG
 }
+
+TEST_F(PassTest, ParsePassPipeline) {
+  class TestPass1 final : public FunctionPass {
+  public:
+    TestPass1() : FunctionPass("test-pass1") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+  class TestPass2 final : public FunctionPass {
+  public:
+    TestPass2() : FunctionPass("test-pass2") {}
+    bool runOnFunction(Function &F) final { return false; }
+  };
+
+  PassRegistry Registry;
+  Registry.registerPass(std::make_unique<TestPass1>());
+  Registry.registerPass(std::make_unique<TestPass2>());
+
+  auto &FPM =
+      Registry.parseAndCreatePassPipeline("test-pass1,test-pass2,test-pass1");
+#ifndef NDEBUG
+  std::string Buff;
+  llvm::raw_string_ostream SS(Buff);
+  FPM.print(SS);
+  EXPECT_EQ(Buff, "init-fpm(test-pass1,test-pass2,test-pass1)");
+#endif // NDEBUG
+
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline("bad-pass-name"),
+               ".*not registered.*");
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline(""), ".*not registered.*");
+  EXPECT_DEATH(Registry.parseAndCreatePassPipeline(","), ".*not registered.*");
+}

From 8e2843b471c5efb5e5df6e0e285bfdc72dff6f17 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Fri, 13 Sep 2024 09:29:31 -0700
Subject: [PATCH 28/43] [RISCV][Docs] Change Zvbb and Zvkb from 'Assembly Support' to Supported. NFC (#108572)

We have generic isel support for Zvkb and Zvbb.

---
 llvm/docs/RISCVUsage.rst | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst
index a15af9adfa945..cf52cd1522847 100644
--- a/llvm/docs/RISCVUsage.rst
+++ b/llvm/docs/RISCVUsage.rst
@@ -207,7 +207,7 @@ on support follow.
 ``Zkt`` Supported
 ``Zmmul`` Supported
 ``Ztso`` Supported
-``Zvbb`` Assembly Support
+``Zvbb`` Supported
 ``Zvbc`` Assembly Support
 ``Zve32x`` (`Partially <#riscv-vlen-32-note>`__) Supported
 ``Zve32f`` (`Partially <#riscv-vlen-32-note>`__) Supported
@@ -217,7 +217,7 @@ on support follow.
 ``Zvfbfmin`` Supported
 ``Zvfbfwma`` Supported
 ``Zvfh`` Supported
-``Zvkb`` Assembly Support
+``Zvkb`` Supported
 ``Zvkg`` Assembly Support
 ``Zvkn`` Assembly Support
 ``Zvknc`` Assembly Support

From f902339d7f24a278b9c77d0226053075bd232ee5 Mon Sep 17 00:00:00 2001
From: James Y Knight
Date: Fri, 13 Sep 2024 12:30:04 -0400
Subject: [PATCH 29/43] Adjust modulemap to mark mm3dnow as textual header. (#107155)

This avoids issuing the deprecation diagnostic when building the module.
Not building it into a module shouldn't have any negative impact, since it
no longer has any declarations other than the header guard. It's also very
rarely included by anything.
Addresses https://github.com/llvm/llvm-project/pull/96246#issuecomment-2322453809

---
 clang/lib/Headers/module.modulemap | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap
index 9ffc249c8d1a2..dcaf09e8f2c55 100644
--- a/clang/lib/Headers/module.modulemap
+++ b/clang/lib/Headers/module.modulemap
@@ -66,6 +66,8 @@ module _Builtin_intrinsics [system] [extern_c] {
     textual header "__wmmintrin_aes.h"
     textual header "__wmmintrin_pclmul.h"

+    textual header "mm3dnow.h"
+
     explicit module mm_malloc {
       requires !freestanding
       header "mm_malloc.h"
@@ -122,10 +124,6 @@ module _Builtin_intrinsics [system] [extern_c] {
       header "popcntintrin.h"
     }

-    explicit module mm3dnow {
-      header "mm3dnow.h"
-    }
-
     explicit module aes_pclmul {
       header "wmmintrin.h"
       export aes

From fffc7fb7ad48d64d164565fdf54bec25267c9d22 Mon Sep 17 00:00:00 2001
From: vporpo
Date: Fri, 13 Sep 2024 09:40:37 -0700
Subject: [PATCH 30/43] [SandboxIR] Implement DSOLocalEquivalent (#108473)

This patch implements sandboxir::DSOLocalEquivalent mirroring
llvm::DSOLocalEquivalent.

---
 llvm/include/llvm/SandboxIR/SandboxIR.h | 34 +++++++++++++++++++
 .../llvm/SandboxIR/SandboxIRValues.def | 1 +
 llvm/lib/SandboxIR/SandboxIR.cpp | 18 ++++++++++
 llvm/unittests/SandboxIR/SandboxIRTest.cpp | 24 +++++++++++++
 4 files changed, 77 insertions(+)

diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h
index 5b57d5cebc334..d21b8a85161e4 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIR.h
+++ b/llvm/include/llvm/SandboxIR/SandboxIR.h
@@ -124,6 +124,7 @@ class ConstantAggregateZero;
 class ConstantPointerNull;
 class PoisonValue;
 class BlockAddress;
+class DSOLocalEquivalent;
 class ConstantTokenNone;
 class GlobalValue;
 class Context;
@@ -328,6 +329,7 @@ class Value {
   friend class PoisonValue; // For `Val`.
   friend class BlockAddress; // For `Val`.
   friend class GlobalValue; // For `Val`.
+  friend class DSOLocalEquivalent; // For `Val`.

   /// All values point to the context.
   Context &Ctx;
@@ -1218,6 +1220,38 @@ class BlockAddress final : public Constant {
   }
 };

+class DSOLocalEquivalent final : public Constant {
+  DSOLocalEquivalent(llvm::DSOLocalEquivalent *C, Context &Ctx)
+      : Constant(ClassID::DSOLocalEquivalent, C, Ctx) {}
+  friend class Context; // For constructor.
+
+public:
+  /// Return a DSOLocalEquivalent for the specified global value.
+  static DSOLocalEquivalent *get(GlobalValue *GV);
+
+  GlobalValue *getGlobalValue() const;
+
+  /// For isa/dyn_cast.
+  static bool classof(const sandboxir::Value *From) {
+    return From->getSubclassID() == ClassID::DSOLocalEquivalent;
+  }
+
+  unsigned getUseOperandNo(const Use &Use) const final {
+    llvm_unreachable("DSOLocalEquivalent has no operands!");
+  }
+
+#ifndef NDEBUG
+  void verify() const override {
+    assert(isa<llvm::DSOLocalEquivalent>(Val) &&
+           "Expected a DSOLocalEquivalent!");
+  }
+  void dumpOS(raw_ostream &OS) const override {
+    dumpCommonPrefix(OS);
+    dumpCommonSuffix(OS);
+  }
+#endif
+};
+
 // TODO: This should inherit from ConstantData.
 class ConstantTokenNone final : public Constant {
   ConstantTokenNone(llvm::ConstantTokenNone *C, Context &Ctx)
diff --git a/llvm/include/llvm/SandboxIR/SandboxIRValues.def b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
index 7b72f9b7173e6..c218ffee3ce38 100644
--- a/llvm/include/llvm/SandboxIR/SandboxIRValues.def
+++ b/llvm/include/llvm/SandboxIR/SandboxIRValues.def
@@ -38,6 +38,7 @@ DEF_CONST(GlobalVariable, GlobalVariable)
 DEF_CONST(GlobalIFunc, GlobalIFunc)
 DEF_CONST(GlobalAlias, GlobalAlias)
 DEF_CONST(BlockAddress, BlockAddress)
+DEF_CONST(DSOLocalEquivalent, DSOLocalEquivalent)
 DEF_CONST(ConstantTokenNone, ConstantTokenNone)

 #ifndef DEF_INSTR
diff --git a/llvm/lib/SandboxIR/SandboxIR.cpp b/llvm/lib/SandboxIR/SandboxIR.cpp
index 8a7c3981e6680..04243564809db 100644
--- a/llvm/lib/SandboxIR/SandboxIR.cpp
+++ b/llvm/lib/SandboxIR/SandboxIR.cpp
@@ -2535,6 +2535,16 @@ BasicBlock *BlockAddress::getBasicBlock() const {
       Ctx.getValue(cast<llvm::BlockAddress>(Val)->getBasicBlock()));
 }

+DSOLocalEquivalent *DSOLocalEquivalent::get(GlobalValue *GV) {
+  auto *LLVMC = llvm::DSOLocalEquivalent::get(cast<llvm::GlobalValue>(GV->Val));
+  return cast<DSOLocalEquivalent>(GV->getContext().getValue(LLVMC));
+}
+
+GlobalValue *DSOLocalEquivalent::getGlobalValue() const {
+  return cast<GlobalValue>(
+      Ctx.getValue(cast<llvm::DSOLocalEquivalent>(Val)->getGlobalValue()));
+}
+
 ConstantTokenNone *ConstantTokenNone::get(Context &Ctx) {
   auto *LLVMC = llvm::ConstantTokenNone::get(Ctx.LLVMCtx);
   return cast<ConstantTokenNone>(Ctx.getOrCreateConstant(LLVMC));
@@ -2669,6 +2679,14 @@ Value *Context::getOrCreateValueInternal(llvm::Value *LLVMV, llvm::User *U) {
     It->second = std::unique_ptr<UndefValue>(
         new UndefValue(cast<llvm::UndefValue>(C), *this));
     return It->second.get();
+  case llvm::Value::DSOLocalEquivalentVal: {
+    auto *DSOLE = cast<llvm::DSOLocalEquivalent>(C);
+    It->second = std::unique_ptr<DSOLocalEquivalent>(
+        new DSOLocalEquivalent(DSOLE, *this));
+    auto *Ret = It->second.get();
+    getOrCreateValueInternal(DSOLE->getGlobalValue(), DSOLE);
+    return Ret;
+  }
   case llvm::Value::ConstantArrayVal:
     It->second = std::unique_ptr<ConstantArray>(
         new ConstantArray(cast<llvm::ConstantArray>(C), *this));
diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
index ad5508f041d6c..d883c185f8296 100644
--- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp
+++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp
@@ -843,6 +843,30 @@ define void @foo(ptr %ptr) {
   EXPECT_EQ(LookupBB2Addr, nullptr);
 }

+TEST_F(SandboxIRTest, DSOLocalEquivalent) {
+  parseIR(C, R"IR(
+declare void @bar()
+define void @foo() {
+  call void dso_local_equivalent @bar()
+  ret void
+}
+)IR");
+  Function &LLVMF = *M->getFunction("foo");
+  sandboxir::Context Ctx(C);
+
+  auto &F = *Ctx.createFunction(&LLVMF);
+  auto *BB = &*F.begin();
+  auto It = BB->begin();
+  auto *CI = cast<sandboxir::CallInst>(&*It++);
+  // Check classof().
+  auto *DSOLE = cast<sandboxir::DSOLocalEquivalent>(CI->getCalledOperand());
+  // Check getGlobalValue().
+  auto *GV = DSOLE->getGlobalValue();
+  // Check get().
+  auto *NewDSOLE = sandboxir::DSOLocalEquivalent::get(GV);
+  EXPECT_EQ(NewDSOLE, DSOLE);
+}
+
 TEST_F(SandboxIRTest, ConstantTokenNone) {
   parseIR(C, R"IR(
 define void @foo(ptr %ptr) {

From 02e4186d0b3508e79d78b0ec844518b13a3fe9ea Mon Sep 17 00:00:00 2001
From: Ganesh
Date: Fri, 13 Sep 2024 22:15:33 +0530
Subject: [PATCH 31/43] [X86] AMD Zen 5 Initial enablement (#107964)

This patch adds the initial skeleton enablement for AMD's next-generation
Zen 5 (znver5) CPUs.
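For reference, a toolchain built with this change accepts -march=znver5 and
-mtune=znver5, and the detection strings added to compiler-rt and Host.cpp
below can drive runtime dispatch. A minimal sketch; compiling it requires a
compiler that already knows the znver5 name, and the printed strings are
illustrative:

  #include <cstdio>

  int main() {
    __builtin_cpu_init(); // populate the runtime CPU model once
    if (__builtin_cpu_is("znver5"))
      std::puts("Zen 5 (family 1Ah) detected");
    else if (__builtin_cpu_is("znver4"))
      std::puts("Zen 4 detected");
    else
      std::puts("other x86 CPU");
    return 0;
  }

The same family 26 (1Ah) model ranges appear in both compiler-rt's cpu_model
and Host.cpp, so __builtin_cpu_is() and -march=native agree on what they
detect.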
---
 clang/lib/Basic/Targets/X86.cpp | 4 +
 clang/test/CodeGen/target-builtin-noerror.c | 1 +
 clang/test/Driver/x86-march.c | 4 +
 clang/test/Frontend/x86-target-cpu.c | 1 +
 clang/test/Misc/target-invalid-cpu-note/x86.c | 4 +
 .../Preprocessor/predefined-arch-macros.c | 142 ++++++++++++++++++
 compiler-rt/lib/builtins/cpu_model/x86.c | 20 +++
 .../llvm/TargetParser/X86TargetParser.def | 3 +
 .../llvm/TargetParser/X86TargetParser.h | 1 +
 llvm/lib/Target/X86/X86.td | 15 ++
 llvm/lib/Target/X86/X86PfmCounters.td | 1 +
 llvm/lib/TargetParser/Host.cpp | 19 +++
 llvm/lib/TargetParser/X86TargetParser.cpp | 5 +
 .../CodeGen/X86/bypass-slow-division-64.ll | 1 +
 llvm/test/CodeGen/X86/cmp16.ll | 1 +
 llvm/test/CodeGen/X86/cpus-amd.ll | 1 +
 llvm/test/CodeGen/X86/rdpru.ll | 1 +
 llvm/test/CodeGen/X86/shuffle-as-shifts.ll | 1 +
 llvm/test/CodeGen/X86/slow-unaligned-mem.ll | 1 +
 llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll | 1 +
 .../X86/tuning-shuffle-permilpd-avx512.ll | 1 +
 .../X86/tuning-shuffle-permilps-avx512.ll | 1 +
 .../X86/tuning-shuffle-unpckpd-avx512.ll | 1 +
 .../X86/tuning-shuffle-unpckps-avx512.ll | 1 +
 .../X86/vector-shuffle-fast-per-lane.ll | 1 +
 llvm/test/CodeGen/X86/vpdpwssd.ll | 1 +
 .../CodeGen/X86/x86-64-double-shifts-var.ll | 1 +
 llvm/test/MC/X86/x86_long_nop.s | 2 +
 .../Transforms/LoopUnroll/X86/call-remark.ll | 1 +
 .../Transforms/SLPVectorizer/X86/pr63668.ll | 1 +
 30 files changed, 238 insertions(+)

diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp
index 62c382b67ad14..5448bd841959f 100644
--- a/clang/lib/Basic/Targets/X86.cpp
+++ b/clang/lib/Basic/Targets/X86.cpp
@@ -728,6 +728,9 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts,
   case CK_ZNVER4:
     defineCPUMacros(Builder, "znver4");
     break;
+  case CK_ZNVER5:
+    defineCPUMacros(Builder, "znver5");
+    break;
   case CK_Geode:
     defineCPUMacros(Builder, "geode");
     break;
@@ -1626,6 +1629,7 @@ std::optional<unsigned> X86TargetInfo::getCPUCacheLineSize() const {
   case CK_ZNVER2:
   case CK_ZNVER3:
   case CK_ZNVER4:
+  case CK_ZNVER5:
   // Deprecated
   case CK_x86_64:
   case CK_x86_64_v2:
diff --git a/clang/test/CodeGen/target-builtin-noerror.c b/clang/test/CodeGen/target-builtin-noerror.c
index 14024e3953182..2a05074d7c2b6 100644
--- a/clang/test/CodeGen/target-builtin-noerror.c
+++ b/clang/test/CodeGen/target-builtin-noerror.c
@@ -207,4 +207,5 @@ void verifycpustrings(void) {
   (void)__builtin_cpu_is("znver2");
   (void)__builtin_cpu_is("znver3");
   (void)__builtin_cpu_is("znver4");
+  (void)__builtin_cpu_is("znver5");
 }
diff --git a/clang/test/Driver/x86-march.c b/clang/test/Driver/x86-march.c
index cc993b53937c1..3bc2a82ae778d 100644
--- a/clang/test/Driver/x86-march.c
+++ b/clang/test/Driver/x86-march.c
@@ -242,6 +242,10 @@
 // RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=znver4 2>&1 \
 // RUN: | FileCheck %s -check-prefix=znver4
 // znver4: "-target-cpu" "znver4"
+//
+// RUN: %clang -target x86_64-unknown-unknown -c -### %s -march=znver5 2>&1 \
+// RUN: | FileCheck %s -check-prefix=znver5
+// znver5: "-target-cpu" "znver5"

 // RUN: %clang -target x86_64 -c -### %s -march=x86-64 2>&1 | FileCheck %s --check-prefix=x86-64
 // x86-64: "-target-cpu" "x86-64"
diff --git a/clang/test/Frontend/x86-target-cpu.c b/clang/test/Frontend/x86-target-cpu.c
index 6c8502ac2c21e..f2885a040c370 100644
--- a/clang/test/Frontend/x86-target-cpu.c
+++ b/clang/test/Frontend/x86-target-cpu.c
@@ -38,5 +38,6 @@
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver2 -verify %s
 // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver3
-verify %s // RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver4 -verify %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -target-cpu znver5 -verify %s // // expected-no-diagnostics diff --git a/clang/test/Misc/target-invalid-cpu-note/x86.c b/clang/test/Misc/target-invalid-cpu-note/x86.c index 607192a5409ba..7879676040af4 100644 --- a/clang/test/Misc/target-invalid-cpu-note/x86.c +++ b/clang/test/Misc/target-invalid-cpu-note/x86.c @@ -99,6 +99,7 @@ // X86-SAME: {{^}}, znver2 // X86-SAME: {{^}}, znver3 // X86-SAME: {{^}}, znver4 +// X86-SAME: {{^}}, znver5 // X86-SAME: {{^}}, x86-64 // X86-SAME: {{^}}, x86-64-v2 // X86-SAME: {{^}}, x86-64-v3 @@ -175,6 +176,7 @@ // X86_64-SAME: {{^}}, znver2 // X86_64-SAME: {{^}}, znver3 // X86_64-SAME: {{^}}, znver4 +// X86_64-SAME: {{^}}, znver5 // X86_64-SAME: {{^}}, x86-64 // X86_64-SAME: {{^}}, x86-64-v2 // X86_64-SAME: {{^}}, x86-64-v3 @@ -278,6 +280,7 @@ // TUNE_X86-SAME: {{^}}, znver2 // TUNE_X86-SAME: {{^}}, znver3 // TUNE_X86-SAME: {{^}}, znver4 +// TUNE_X86-SAME: {{^}}, znver5 // TUNE_X86-SAME: {{^}}, x86-64 // TUNE_X86-SAME: {{^}}, geode // TUNE_X86-SAME: {{$}} @@ -379,6 +382,7 @@ // TUNE_X86_64-SAME: {{^}}, znver2 // TUNE_X86_64-SAME: {{^}}, znver3 // TUNE_X86_64-SAME: {{^}}, znver4 +// TUNE_X86_64-SAME: {{^}}, znver5 // TUNE_X86_64-SAME: {{^}}, x86-64 // TUNE_X86_64-SAME: {{^}}, geode // TUNE_X86_64-SAME: {{$}} diff --git a/clang/test/Preprocessor/predefined-arch-macros.c b/clang/test/Preprocessor/predefined-arch-macros.c index 49646d94d920c..a149c69ee0cdb 100644 --- a/clang/test/Preprocessor/predefined-arch-macros.c +++ b/clang/test/Preprocessor/predefined-arch-macros.c @@ -3923,6 +3923,148 @@ // CHECK_ZNVER4_M64: #define __znver4 1 // CHECK_ZNVER4_M64: #define __znver4__ 1 +// RUN: %clang -march=znver5 -m32 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER5_M32 +// CHECK_ZNVER5_M32-NOT: #define __3dNOW_A__ 1 +// CHECK_ZNVER5_M32-NOT: #define __3dNOW__ 1 +// CHECK_ZNVER5_M32: #define __ADX__ 1 +// CHECK_ZNVER5_M32: #define __AES__ 1 +// CHECK_ZNVER5_M32: #define __AVX2__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BF16__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BITALG__ 1 +// CHECK_ZNVER5_M32: #define __AVX512BW__ 1 +// CHECK_ZNVER5_M32: #define __AVX512CD__ 1 +// CHECK_ZNVER5_M32: #define __AVX512DQ__ 1 +// CHECK_ZNVER5_M32: #define __AVX512F__ 1 +// CHECK_ZNVER5_M32: #define __AVX512IFMA__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VBMI2__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VBMI__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VL__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VNNI__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VP2INTERSECT__ 1 +// CHECK_ZNVER5_M32: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_ZNVER5_M32: #define __AVXVNNI__ 1 +// CHECK_ZNVER5_M32: #define __AVX__ 1 +// CHECK_ZNVER5_M32: #define __BMI2__ 1 +// CHECK_ZNVER5_M32: #define __BMI__ 1 +// CHECK_ZNVER5_M32: #define __CLFLUSHOPT__ 1 +// CHECK_ZNVER5_M32: #define __CLWB__ 1 +// CHECK_ZNVER5_M32: #define __CLZERO__ 1 +// CHECK_ZNVER5_M32: #define __F16C__ 1 +// CHECK_ZNVER5_M32-NOT: #define __FMA4__ 1 +// CHECK_ZNVER5_M32: #define __FMA__ 1 +// CHECK_ZNVER5_M32: #define __FSGSBASE__ 1 +// CHECK_ZNVER5_M32: #define __GFNI__ 1 +// CHECK_ZNVER5_M32: #define __LZCNT__ 1 +// CHECK_ZNVER5_M32: #define __MMX__ 1 +// CHECK_ZNVER5_M32: #define __MOVDIR64B__ 1 +// CHECK_ZNVER5_M32: #define __MOVDIRI__ 1 +// CHECK_ZNVER5_M32: #define __PCLMUL__ 1 +// CHECK_ZNVER5_M32: #define __PKU__ 1 +// CHECK_ZNVER5_M32: 
#define __POPCNT__ 1 +// CHECK_ZNVER5_M32: #define __PREFETCHI__ 1 +// CHECK_ZNVER5_M32: #define __PRFCHW__ 1 +// CHECK_ZNVER5_M32: #define __RDPID__ 1 +// CHECK_ZNVER5_M32: #define __RDPRU__ 1 +// CHECK_ZNVER5_M32: #define __RDRND__ 1 +// CHECK_ZNVER5_M32: #define __RDSEED__ 1 +// CHECK_ZNVER5_M32: #define __SHA__ 1 +// CHECK_ZNVER5_M32: #define __SSE2_MATH__ 1 +// CHECK_ZNVER5_M32: #define __SSE2__ 1 +// CHECK_ZNVER5_M32: #define __SSE3__ 1 +// CHECK_ZNVER5_M32: #define __SSE4A__ 1 +// CHECK_ZNVER5_M32: #define __SSE4_1__ 1 +// CHECK_ZNVER5_M32: #define __SSE4_2__ 1 +// CHECK_ZNVER5_M32: #define __SSE_MATH__ 1 +// CHECK_ZNVER5_M32: #define __SSE__ 1 +// CHECK_ZNVER5_M32: #define __SSSE3__ 1 +// CHECK_ZNVER5_M32-NOT: #define __TBM__ 1 +// CHECK_ZNVER5_M32: #define __WBNOINVD__ 1 +// CHECK_ZNVER5_M32-NOT: #define __XOP__ 1 +// CHECK_ZNVER5_M32: #define __XSAVEC__ 1 +// CHECK_ZNVER5_M32: #define __XSAVEOPT__ 1 +// CHECK_ZNVER5_M32: #define __XSAVES__ 1 +// CHECK_ZNVER5_M32: #define __XSAVE__ 1 +// CHECK_ZNVER5_M32: #define __i386 1 +// CHECK_ZNVER5_M32: #define __i386__ 1 +// CHECK_ZNVER5_M32: #define __tune_znver5__ 1 +// CHECK_ZNVER5_M32: #define __znver5 1 +// CHECK_ZNVER5_M32: #define __znver5__ 1 + +// RUN: %clang -march=znver5 -m64 -E -dM %s -o - 2>&1 \ +// RUN: -target i386-unknown-linux \ +// RUN: | FileCheck -match-full-lines %s -check-prefix=CHECK_ZNVER5_M64 +// CHECK_ZNVER5_M64-NOT: #define __3dNOW_A__ 1 +// CHECK_ZNVER5_M64-NOT: #define __3dNOW__ 1 +// CHECK_ZNVER5_M64: #define __ADX__ 1 +// CHECK_ZNVER5_M64: #define __AES__ 1 +// CHECK_ZNVER5_M64: #define __AVX2__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BF16__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BITALG__ 1 +// CHECK_ZNVER5_M64: #define __AVX512BW__ 1 +// CHECK_ZNVER5_M64: #define __AVX512CD__ 1 +// CHECK_ZNVER5_M64: #define __AVX512DQ__ 1 +// CHECK_ZNVER5_M64: #define __AVX512F__ 1 +// CHECK_ZNVER5_M64: #define __AVX512IFMA__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VBMI2__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VBMI__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VL__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VNNI__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VP2INTERSECT__ 1 +// CHECK_ZNVER5_M64: #define __AVX512VPOPCNTDQ__ 1 +// CHECK_ZNVER5_M64: #define __AVXVNNI__ 1 +// CHECK_ZNVER5_M64: #define __AVX__ 1 +// CHECK_ZNVER5_M64: #define __BMI2__ 1 +// CHECK_ZNVER5_M64: #define __BMI__ 1 +// CHECK_ZNVER5_M64: #define __CLFLUSHOPT__ 1 +// CHECK_ZNVER5_M64: #define __CLWB__ 1 +// CHECK_ZNVER5_M64: #define __CLZERO__ 1 +// CHECK_ZNVER5_M64: #define __F16C__ 1 +// CHECK_ZNVER5_M64-NOT: #define __FMA4__ 1 +// CHECK_ZNVER5_M64: #define __FMA__ 1 +// CHECK_ZNVER5_M64: #define __FSGSBASE__ 1 +// CHECK_ZNVER5_M64: #define __GFNI__ 1 +// CHECK_ZNVER5_M64: #define __LZCNT__ 1 +// CHECK_ZNVER5_M64: #define __MMX__ 1 +// CHECK_ZNVER5_M64: #define __MOVDIR64B__ 1 +// CHECK_ZNVER5_M64: #define __MOVDIRI__ 1 +// CHECK_ZNVER5_M64: #define __PCLMUL__ 1 +// CHECK_ZNVER5_M64: #define __PKU__ 1 +// CHECK_ZNVER5_M64: #define __POPCNT__ 1 +// CHECK_ZNVER5_M64: #define __PREFETCHI__ 1 +// CHECK_ZNVER5_M64: #define __PRFCHW__ 1 +// CHECK_ZNVER5_M64: #define __RDPID__ 1 +// CHECK_ZNVER5_M64: #define __RDPRU__ 1 +// CHECK_ZNVER5_M64: #define __RDRND__ 1 +// CHECK_ZNVER5_M64: #define __RDSEED__ 1 +// CHECK_ZNVER5_M64: #define __SHA__ 1 +// CHECK_ZNVER5_M64: #define __SSE2_MATH__ 1 +// CHECK_ZNVER5_M64: #define __SSE2__ 1 +// CHECK_ZNVER5_M64: #define __SSE3__ 1 +// CHECK_ZNVER5_M64: #define __SSE4A__ 1 +// CHECK_ZNVER5_M64: #define __SSE4_1__ 1 +// 
CHECK_ZNVER5_M64: #define __SSE4_2__ 1 +// CHECK_ZNVER5_M64: #define __SSE_MATH__ 1 +// CHECK_ZNVER5_M64: #define __SSE__ 1 +// CHECK_ZNVER5_M64: #define __SSSE3__ 1 +// CHECK_ZNVER5_M64-NOT: #define __TBM__ 1 +// CHECK_ZNVER5_M64: #define __VAES__ 1 +// CHECK_ZNVER5_M64: #define __VPCLMULQDQ__ 1 +// CHECK_ZNVER5_M64: #define __WBNOINVD__ 1 +// CHECK_ZNVER5_M64-NOT: #define __XOP__ 1 +// CHECK_ZNVER5_M64: #define __XSAVEC__ 1 +// CHECK_ZNVER5_M64: #define __XSAVEOPT__ 1 +// CHECK_ZNVER5_M64: #define __XSAVES__ 1 +// CHECK_ZNVER5_M64: #define __XSAVE__ 1 +// CHECK_ZNVER5_M64: #define __amd64 1 +// CHECK_ZNVER5_M64: #define __amd64__ 1 +// CHECK_ZNVER5_M64: #define __tune_znver5__ 1 +// CHECK_ZNVER5_M64: #define __x86_64 1 +// CHECK_ZNVER5_M64: #define __x86_64__ 1 +// CHECK_ZNVER5_M64: #define __znver5 1 +// CHECK_ZNVER5_M64: #define __znver5__ 1 + // End X86/GCC/Linux tests ------------------ // Begin PPC/GCC/Linux tests ---------------- diff --git a/compiler-rt/lib/builtins/cpu_model/x86.c b/compiler-rt/lib/builtins/cpu_model/x86.c index 069defc970190..dbe6094541f63 100644 --- a/compiler-rt/lib/builtins/cpu_model/x86.c +++ b/compiler-rt/lib/builtins/cpu_model/x86.c @@ -63,6 +63,7 @@ enum ProcessorTypes { INTEL_SIERRAFOREST, INTEL_GRANDRIDGE, INTEL_CLEARWATERFOREST, + AMDFAM1AH, CPU_TYPE_MAX }; @@ -101,6 +102,7 @@ enum ProcessorSubtypes { INTEL_COREI7_ARROWLAKE, INTEL_COREI7_ARROWLAKE_S, INTEL_COREI7_PANTHERLAKE, + AMDFAM1AH_ZNVER5, CPU_SUBTYPE_MAX }; @@ -748,6 +750,24 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family, break; // "znver4" } break; // family 19h + case 26: + CPU = "znver5"; + *Type = AMDFAM1AH; + if (Model <= 0x77) { + // Models 00h-0Fh (Breithorn). + // Models 10h-1Fh (Breithorn-Dense). + // Models 20h-2Fh (Strix 1). + // Models 30h-37h (Strix 2). + // Models 38h-3Fh (Strix 3). + // Models 40h-4Fh (Granite Ridge). + // Models 50h-5Fh (Weisshorn). + // Models 60h-6Fh (Krackan1). + // Models 70h-77h (Sarlak). + CPU = "znver5"; + *Subtype = AMDFAM1AH_ZNVER5; + break; // "znver5" + } + break; default: break; // Unknown AMD CPU. } diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.def b/llvm/include/llvm/TargetParser/X86TargetParser.def index cd160f54e6670..e5bf196559ba6 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.def +++ b/llvm/include/llvm/TargetParser/X86TargetParser.def @@ -49,11 +49,13 @@ X86_CPU_TYPE(ZHAOXIN_FAM7H, "zhaoxin_fam7h") X86_CPU_TYPE(INTEL_SIERRAFOREST, "sierraforest") X86_CPU_TYPE(INTEL_GRANDRIDGE, "grandridge") X86_CPU_TYPE(INTEL_CLEARWATERFOREST, "clearwaterforest") +X86_CPU_TYPE(AMDFAM1AH, "amdfam1ah") // Alternate names supported by __builtin_cpu_is and target multiversioning. X86_CPU_TYPE_ALIAS(INTEL_BONNELL, "atom") X86_CPU_TYPE_ALIAS(AMDFAM10H, "amdfam10") X86_CPU_TYPE_ALIAS(AMDFAM15H, "amdfam15") +X86_CPU_TYPE_ALIAS(AMDFAM1AH, "amdfam1a") X86_CPU_TYPE_ALIAS(INTEL_SILVERMONT, "slm") #undef X86_CPU_TYPE_ALIAS @@ -104,6 +106,7 @@ X86_CPU_SUBTYPE(INTEL_COREI7_GRANITERAPIDS_D,"graniterapids-d") X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE, "arrowlake") X86_CPU_SUBTYPE(INTEL_COREI7_ARROWLAKE_S, "arrowlake-s") X86_CPU_SUBTYPE(INTEL_COREI7_PANTHERLAKE, "pantherlake") +X86_CPU_SUBTYPE(AMDFAM1AH_ZNVER5, "znver5") // Alternate names supported by __builtin_cpu_is and target multiversioning. 
X86_CPU_SUBTYPE_ALIAS(INTEL_COREI7_ALDERLAKE, "raptorlake")
diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h
index 2083e585af4ac..0e17c4674719c 100644
--- a/llvm/include/llvm/TargetParser/X86TargetParser.h
+++ b/llvm/include/llvm/TargetParser/X86TargetParser.h
@@ -142,6 +142,7 @@ enum CPUKind {
   CK_ZNVER2,
   CK_ZNVER3,
   CK_ZNVER4,
+  CK_ZNVER5,
   CK_x86_64,
   CK_x86_64_v2,
   CK_x86_64_v3,
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index 988966fa6a6c4..6cf37836f921d 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -1549,6 +1549,19 @@ def ProcessorFeatures {
                                                     FeatureVPOPCNTDQ];
   list<SubtargetFeature> ZN4Features =
     !listconcat(ZN3Features, ZN4AdditionalFeatures);
+
+
+  list<SubtargetFeature> ZN5Tuning = ZN4Tuning;
+  list<SubtargetFeature> ZN5AdditionalFeatures = [FeatureVNNI,
+                                                  FeatureMOVDIRI,
+                                                  FeatureMOVDIR64B,
+                                                  FeatureVP2INTERSECT,
+                                                  FeaturePREFETCHI,
+                                                  FeatureAVXVNNI
+                                                  ];
+  list<SubtargetFeature> ZN5Features =
+    !listconcat(ZN4Features, ZN5AdditionalFeatures);
+
 }

@@ -1898,6 +1911,8 @@ def : ProcModel<"znver3", Znver3Model, ProcessorFeatures.ZN3Features,
                 ProcessorFeatures.ZN3Tuning>;
 def : ProcModel<"znver4", Znver4Model, ProcessorFeatures.ZN4Features,
                 ProcessorFeatures.ZN4Tuning>;
+def : ProcModel<"znver5", Znver4Model, ProcessorFeatures.ZN5Features,
+                ProcessorFeatures.ZN5Tuning>;
 def : Proc<"geode", [FeatureX87, FeatureCX8, FeatureMMX, FeaturePRFCHW],
            [TuningSlowUAMem16, TuningInsertVZEROUPPER]>;
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index 2b1dac411c992..c30e989cdc2af 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -350,3 +350,4 @@ def ZnVer4PfmCounters : ProcPfmCounters {
   let ValidationCounters = DefaultAMDPfmValidationCounters;
 }
 def : PfmCountersBinding<"znver4", ZnVer4PfmCounters>;
+def : PfmCountersBinding<"znver5", ZnVer4PfmCounters>;
diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp
index 986b9a211ce6c..b2c4f9ee00293 100644
--- a/llvm/lib/TargetParser/Host.cpp
+++ b/llvm/lib/TargetParser/Host.cpp
@@ -1151,6 +1151,25 @@ static const char *getAMDProcessorTypeAndSubtype(unsigned Family,
       break; // "znver4"
     }
     break; // family 19h
+  case 26:
+    CPU = "znver5";
+    *Type = X86::AMDFAM1AH;
+    if (Model <= 0x77) {
+      // Models 00h-0Fh (Breithorn).
+      // Models 10h-1Fh (Breithorn-Dense).
+      // Models 20h-2Fh (Strix 1).
+      // Models 30h-37h (Strix 2).
+      // Models 38h-3Fh (Strix 3).
+      // Models 40h-4Fh (Granite Ridge).
+      // Models 50h-5Fh (Weisshorn).
+      // Models 60h-6Fh (Krackan1).
+      // Models 70h-77h (Sarlak).
+      CPU = "znver5";
+      *Subtype = X86::AMDFAM1AH_ZNVER5;
+      break; // "znver5"
+    }
+    break;
+
   default:
     break; // Unknown AMD CPU.
   }
diff --git a/llvm/lib/TargetParser/X86TargetParser.cpp b/llvm/lib/TargetParser/X86TargetParser.cpp
index 57bda0651ea82..09d4312918acf 100644
--- a/llvm/lib/TargetParser/X86TargetParser.cpp
+++ b/llvm/lib/TargetParser/X86TargetParser.cpp
@@ -238,6 +238,10 @@ static constexpr FeatureBitset FeaturesZNVER4 =
     FeatureAVX512BITALG | FeatureAVX512VPOPCNTDQ | FeatureAVX512BF16 |
     FeatureGFNI | FeatureSHSTK;

+static constexpr FeatureBitset FeaturesZNVER5 =
+    FeaturesZNVER4 | FeatureAVXVNNI | FeatureMOVDIRI | FeatureMOVDIR64B |
+    FeatureAVX512VP2INTERSECT | FeaturePREFETCHI;
+
 // D151696 tranplanted Mangling and OnlyForCPUDispatchSpecific from
 // X86TargetParser.def to here. They are assigned by following ways:
 // 1.
Copy the mangling from the original CPU_SPEICIFC MACROs. If no, assign @@ -417,6 +421,7 @@ constexpr ProcInfo Processors[] = { { {"znver2"}, CK_ZNVER2, FEATURE_AVX2, FeaturesZNVER2, '\0', false }, { {"znver3"}, CK_ZNVER3, FEATURE_AVX2, FeaturesZNVER3, '\0', false }, { {"znver4"}, CK_ZNVER4, FEATURE_AVX512VBMI2, FeaturesZNVER4, '\0', false }, + { {"znver5"}, CK_ZNVER5, FEATURE_AVX512VP2INTERSECT, FeaturesZNVER5, '\0', false }, // Generic 64-bit processor. { {"x86-64"}, CK_x86_64, FEATURE_SSE2 , FeaturesX86_64, '\0', false }, { {"x86-64-v2"}, CK_x86_64_v2, FEATURE_SSE4_2 , FeaturesX86_64_V2, '\0', false }, diff --git a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll index 6e0cfdd26a786..b0ca0069a526b 100644 --- a/llvm/test/CodeGen/X86/bypass-slow-division-64.ll +++ b/llvm/test/CodeGen/X86/bypass-slow-division-64.ll @@ -23,6 +23,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,SLOW-DIVQ ; Additional tests for 64-bit divide bypass diff --git a/llvm/test/CodeGen/X86/cmp16.ll b/llvm/test/CodeGen/X86/cmp16.ll index fa9e75ff16a5c..8c14a78d9e113 100644 --- a/llvm/test/CodeGen/X86/cmp16.ll +++ b/llvm/test/CodeGen/X86/cmp16.ll @@ -13,6 +13,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefixes=X64,X64-FAST ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=X64,X64-FAST ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=X64,X64-FAST +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=X64,X64-FAST define i1 @cmp16_reg_eq_reg(i16 %a0, i16 %a1) { ; X86-GENERIC-LABEL: cmp16_reg_eq_reg: diff --git a/llvm/test/CodeGen/X86/cpus-amd.ll b/llvm/test/CodeGen/X86/cpus-amd.ll index 228a00428c457..33b2cf3731478 100644 --- a/llvm/test/CodeGen/X86/cpus-amd.ll +++ b/llvm/test/CodeGen/X86/cpus-amd.ll @@ -29,6 +29,7 @@ ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver2 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver3 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty ; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver4 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty +; RUN: llc < %s -o /dev/null -mtriple=x86_64-unknown-unknown -mcpu=znver5 2>&1 | FileCheck %s --check-prefix=CHECK-NO-ERROR --allow-empty define void @foo() { ret void diff --git a/llvm/test/CodeGen/X86/rdpru.ll b/llvm/test/CodeGen/X86/rdpru.ll index 7771f52653cb5..be79a4499a338 100644 --- a/llvm/test/CodeGen/X86/rdpru.ll +++ b/llvm/test/CodeGen/X86/rdpru.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 -fast-isel | FileCheck %s --check-prefix=X64 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 -fast-isel | FileCheck %s --check-prefix=X64 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 -fast-isel | FileCheck %s --check-prefix=X64 define void @rdpru_asm() { ; X86-LABEL: rdpru_asm: diff --git a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll index e89197f5b42c3..9c8729b3ea505 100644 
--- a/llvm/test/CodeGen/X86/shuffle-as-shifts.ll +++ b/llvm/test/CodeGen/X86/shuffle-as-shifts.ll @@ -3,6 +3,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=icelake-server | FileCheck %s --check-prefixes=CHECK,CHECK-ICX ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <4 x i32> @shuf_rot_v4i32_1032(<4 x i32> %x) { diff --git a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll index d74d195439bda..ceef3fb4bb188 100644 --- a/llvm/test/CodeGen/X86/slow-unaligned-mem.ll +++ b/llvm/test/CodeGen/X86/slow-unaligned-mem.ll @@ -50,6 +50,7 @@ ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver2 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver3 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX256 ; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver4 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512 +; RUN: llc < %s -mtriple=i386-unknown-unknown -mcpu=znver5 2>&1 | FileCheck %s --check-prefixes=FAST,FAST-AVX512 ; Other chips with slow unaligned memory accesses diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll index 9f2071ff14b87..2b78a70ebcc26 100644 --- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll +++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll @@ -6,6 +6,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver1 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s --check-prefixes=FAST-SCALAR,FAST-VECTOR ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X86-64 define float @f32_no_daz(float %f) #0 { diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll index 7d8bb567c09b3..162ab71fc00d4 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilpd-avx512.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <8 x double> @transform_VPERMILPSZrr(<8 x double> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSZrr: diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll index 5d031f6017c77..cd97946da248f 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-permilps-avx512.ll @@ -4,6 +4,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s 
-mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VPERMILPSZrr(<16 x float> %a) nounwind { ; CHECK-LABEL: transform_VPERMILPSZrr: diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll index 4a160bc9debc7..5ea991f85523e 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckpd-avx512.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VUNPCKLPDZrr(<16 x float> %a, <16 x float> %b) nounwind { diff --git a/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll b/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll index d0e3ad9b19086..96155f0300d2d 100644 --- a/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll +++ b/llvm/test/CodeGen/X86/tuning-shuffle-unpckps-avx512.ll @@ -5,6 +5,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,CHECK-V4 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefixes=CHECK,CHECK-AVX512 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=CHECK,CHECK-ZNVER4 define <16 x float> @transform_VUNPCKLPSZrr(<16 x float> %a, <16 x float> %b) nounwind { ; CHECK-LABEL: transform_VUNPCKLPSZrr: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll b/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll index e59532d4fef30..4021b1bf292bb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-fast-per-lane.ll @@ -8,6 +8,7 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver2 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver3 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver4 | FileCheck %s --check-prefixes=FAST +; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=znver5 | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell | FileCheck %s --check-prefixes=FAST ; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefixes=FAST diff --git a/llvm/test/CodeGen/X86/vpdpwssd.ll b/llvm/test/CodeGen/X86/vpdpwssd.ll index e6a07b4aeb271..3c1eb92e9e3c3 100644 --- a/llvm/test/CodeGen/X86/vpdpwssd.ll +++ b/llvm/test/CodeGen/X86/vpdpwssd.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=znver5 | FileCheck %s ; RUN: 
llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vnni,+fast-dpwssd | FileCheck %s define <16 x i32> @vpdpwssd_test(<16 x i32> %0, <16 x i32> %1, <16 x i32> %2) { diff --git a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll index af6fbdc9f60de..bbaa414924707 100644 --- a/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll +++ b/llvm/test/CodeGen/X86/x86-64-double-shifts-var.ll @@ -16,6 +16,7 @@ ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver2 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver3 | FileCheck %s ; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver4 | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-- -mcpu=znver5 | FileCheck %s ; Verify that for the X86_64 processors that are known to have poor latency ; double precision shift instructions we do not generate 'shld' or 'shrd' diff --git a/llvm/test/MC/X86/x86_long_nop.s b/llvm/test/MC/X86/x86_long_nop.s index 6136c3db9a3da..b79403bb5f1ec 100644 --- a/llvm/test/MC/X86/x86_long_nop.s +++ b/llvm/test/MC/X86/x86_long_nop.s @@ -19,6 +19,8 @@ # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver3 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver4 %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver4 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=x86_64-pc-linux-gnu -mcpu=znver5 %s | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 +# RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu %s -mcpu=znver5 | llvm-objdump -d --no-show-raw-insn - | FileCheck %s --check-prefix=LNOP15 # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=nehalem %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=westmere %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP10 %s # RUN: llvm-mc -filetype=obj -arch=x86 -triple=i686-pc-linux-gnu -mcpu=sandybridge %s | llvm-objdump -d --no-show-raw-insn - | FileCheck --check-prefix=LNOP15 %s diff --git a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll index abdcfcf7e0742..b05994ddfa35e 100644 --- a/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll +++ b/llvm/test/Transforms/LoopUnroll/X86/call-remark.ll @@ -1,6 +1,7 @@ ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=TTI -pass-remarks-analysis=TTI < %s -S 2>&1 | FileCheck --check-prefixes=ALL,TTI %s ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver4 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s +; RUN: opt -passes=debugify,loop-unroll -mcpu=znver5 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 | FileCheck --check-prefixes=ALL,UNROLL %s ; RUN: opt -passes=debugify,loop-unroll -mcpu=znver3 -pass-remarks=loop-unroll -pass-remarks-analysis=loop-unroll < %s -S 2>&1 --try-experimental-debuginfo-iterators | FileCheck --check-prefixes=ALL,UNROLL %s diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll index 391771e06cab8..037e073de9d59 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr63668.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 3 ; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver4 -S < %s | FileCheck %s +; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -mcpu=znver5 -S < %s | FileCheck %s define internal i32 @testfunc() { ; CHECK-LABEL: define internal i32 @testfunc From b9d85b1263efa8c4953f8cf10999ee165f32922e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 13 Sep 2024 10:04:33 -0700 Subject: [PATCH 32/43] [CodeGen] Use DenseMap::operator[] (NFC) (#108489) Once we modernize CopyInfo with default member initializations, Copies.insert({Unit, ...}) becomes equivalent to: Copies.try_emplace(Unit) which we can simplify further down to Copies[Unit]. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index fab36f4858e09..8bcc437cbfb86 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -108,9 +108,10 @@ static std::optional isCopyInstr(const MachineInstr &MI, class CopyTracker { struct CopyInfo { - MachineInstr *MI, *LastSeenUseInCopy; + MachineInstr *MI = nullptr; + MachineInstr *LastSeenUseInCopy = nullptr; SmallVector DefRegs; - bool Avail; + bool Avail = false; }; DenseMap Copies; @@ -240,8 +241,7 @@ class CopyTracker { // Remember source that's copied to Def. Once it's clobbered, then // it's no longer available for copy propagation. for (MCRegUnit Unit : TRI.regunits(Src)) { - auto I = Copies.insert({Unit, {nullptr, nullptr, {}, false}}); - auto &Copy = I.first->second; + auto &Copy = Copies[Unit]; if (!is_contained(Copy.DefRegs, Def)) Copy.DefRegs.push_back(Def); Copy.LastSeenUseInCopy = MI; From 4ef16e3160750717f447fd76e2383c10b68eb5ae Mon Sep 17 00:00:00 2001 From: Julian Schmidt Date: Fri, 13 Sep 2024 19:05:23 +0200 Subject: [PATCH 33/43] [NFC][clang-tidy] document fix to bugprone-return-const-ref-from-parameter (#107641) Describe how the issue that is diagnosed by this check can be resolved. Namely, by adding an overload for the xvalue case (`&&` parameter). Fixes #107600 --- .../checks/bugprone/return-const-ref-from-parameter.rst | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst index f007dfe549990..2349e51477b7d 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/return-const-ref-from-parameter.rst @@ -12,6 +12,15 @@ after the call. When the function returns such a parameter also as constant refe then the returned reference can be used after the object it refers to has been destroyed. +This issue can be resolved by declaring an overload of the problematic function +where the ``const &`` parameter is instead declared as ``&&``. The developer has +to ensure that the implementation of that function does not produce a +use-after-free, the exact error that this check is warning against. 
+Marking such an ``&&`` overload as ``deleted`` will silence the warning as +well. In the case of different ``const &`` parameters being returned depending +on the control flow of the function, an overload where all problematic +``const &`` parameters have been declared as ``&&`` will resolve the issue. + Example ------- From a953982cb7dee0678bb5f7c2febe4c3b8b718c7a Mon Sep 17 00:00:00 2001 From: Krzysztof Drewniak Date: Fri, 13 Sep 2024 12:07:51 -0500 Subject: [PATCH 34/43] [mlir][GPU] Plumb range information through the NVVM lowerings (#107659) Update the GPU to NVVM lowerings to correctly propagate range information on IDs and dimension queries, either from known_{block,grid}_size attributes or from `upperBound` annotations on the operations themselves. --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 71 +++++++++++-------- .../GPUToNVVM/LowerGpuOpsToNVVMOps.cpp | 54 +++++++++----- .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp | 1 + .../Conversion/GPUToNVVM/gpu-to-nvvm.mlir | 34 +++++++-- mlir/test/Target/LLVMIR/Import/nvvmir.ll | 3 + mlir/test/Target/LLVMIR/nvvmir.mlir | 5 +- 6 files changed, 119 insertions(+), 49 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 7bbf18fe0106f..152715f281088 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -123,52 +123,67 @@ class NVVM_SpecialRegisterOp traits = []> : let assemblyFormat = "attr-dict `:` type($res)"; } +class NVVM_SpecialRangeableRegisterOp traits = []> : + NVVM_SpecialRegisterOp { + let arguments = (ins OptionalAttr:$range); + let assemblyFormat = "(`range` $range^)? attr-dict `:` type($res)"; + let llvmBuilder = baseLlvmBuilder # setRangeRetAttrCode # baseLlvmBuilderCoda; + let mlirBuilder = baseMlirBuilder # importRangeRetAttrCode # baseMlirBuilderCoda; + + // Backwards-compatibility builder for an unspecified range.
+ let builders = [ + OpBuilder<(ins "Type":$resultType), [{ + build($_builder, $_state, resultType, ::mlir::LLVM::ConstantRangeAttr{}); + }]> + ]; +} + //===----------------------------------------------------------------------===// // Lane index and range -def NVVM_LaneIdOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.laneid">; -def NVVM_WarpSizeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.warpsize">; +def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">; +def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">; //===----------------------------------------------------------------------===// // Thread index and range -def NVVM_ThreadIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.x">; -def NVVM_ThreadIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.y">; -def NVVM_ThreadIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.tid.z">; -def NVVM_BlockDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.x">; -def NVVM_BlockDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.y">; -def NVVM_BlockDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ntid.z">; +def NVVM_ThreadIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.x">; +def NVVM_ThreadIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.y">; +def NVVM_ThreadIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.tid.z">; +def NVVM_BlockDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.x">; +def NVVM_BlockDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.y">; +def NVVM_BlockDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ntid.z">; //===----------------------------------------------------------------------===// // Block index and range -def NVVM_BlockIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.x">; -def NVVM_BlockIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.y">; -def NVVM_BlockIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.ctaid.z">; -def NVVM_GridDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.x">; -def NVVM_GridDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.y">; -def NVVM_GridDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nctaid.z">; +def NVVM_BlockIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.x">; +def NVVM_BlockIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.y">; +def NVVM_BlockIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.ctaid.z">; +def NVVM_GridDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.x">; +def NVVM_GridDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.y">; +def NVVM_GridDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nctaid.z">; //===----------------------------------------------------------------------===// // CTA Cluster index and range -def NVVM_ClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.x">; -def NVVM_ClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.y">; -def NVVM_ClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clusterid.z">; -def NVVM_ClusterDimXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.x">; -def NVVM_ClusterDimYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.y">; -def NVVM_ClusterDimZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.nclusterid.z">; +def NVVM_ClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.x">; +def NVVM_ClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.y">; +def NVVM_ClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.clusterid.z">; +def NVVM_ClusterDimXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.x">; +def 
NVVM_ClusterDimYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.y">; +def NVVM_ClusterDimZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nclusterid.z">; //===----------------------------------------------------------------------===// // CTA index and range within Cluster -def NVVM_BlockInClusterIdXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; -def NVVM_BlockInClusterIdYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; -def NVVM_BlockInClusterIdZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; -def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; -def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; -def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; +def NVVM_BlockInClusterIdXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.x">; +def NVVM_BlockInClusterIdYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.y">; +def NVVM_BlockInClusterIdZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctaid.z">; +def NVVM_ClusterDimBlocksXOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.x">; +def NVVM_ClusterDimBlocksYOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.y">; +def NVVM_ClusterDimBlocksZOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctaid.z">; //===----------------------------------------------------------------------===// // CTA index and across Cluster dimensions -def NVVM_ClusterId : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.ctarank">; -def NVVM_ClusterDim : NVVM_SpecialRegisterOp<"read.ptx.sreg.cluster.nctarank">; +def NVVM_ClusterId : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.ctarank">; +def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nctarank">; //===----------------------------------------------------------------------===// // Clock registers diff --git a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp index 9b1be198f77a8..164622d77e6b6 100644 --- a/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp +++ b/mlir/lib/Conversion/GPUToNVVM/LowerGpuOpsToNVVMOps.cpp @@ -29,6 +29,7 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Dialect/Math/IR/Math.h" #include "mlir/Dialect/MemRef/IR/MemRef.h" +#include "mlir/Dialect/NVGPU/IR/NVGPUDialect.h" #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" @@ -209,7 +210,15 @@ struct GPULaneIdOpToNVVM : ConvertOpToLLVMPattern { ConversionPatternRewriter &rewriter) const override { auto loc = op->getLoc(); MLIRContext *context = rewriter.getContext(); - Value newOp = rewriter.create(loc, rewriter.getI32Type()); + LLVM::ConstantRangeAttr bounds = nullptr; + if (std::optional upperBound = op.getUpperBound()) + bounds = rewriter.getAttr( + /*bitWidth=*/32, /*lower=*/0, upperBound->getZExtValue()); + else + bounds = rewriter.getAttr( + /*bitWidth=*/32, /*lower=*/0, /*upper=*/kWarpSize); + Value newOp = + rewriter.create(loc, rewriter.getI32Type(), bounds); // Truncate or extend the result depending on the index bitwidth specified // by the LLVMTypeConverter options. 
const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); @@ -340,27 +349,40 @@ void mlir::populateGpuSubgroupReduceOpLoweringPattern( void mlir::populateGpuToNVVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns) { + using gpu::index_lowering::IndexKind; + using gpu::index_lowering::IntrType; populateWithGenerated(patterns); patterns.add(converter); patterns.add< gpu::index_lowering::OpLowering, + NVVM::ThreadIdYOp, NVVM::ThreadIdZOp>>( + converter, IndexKind::Block, IntrType::Id); + patterns.add< gpu::index_lowering::OpLowering, + NVVM::BlockDimYOp, NVVM::BlockDimZOp>>( + converter, IndexKind::Block, IntrType::Dim); + patterns.add< gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering< - gpu::ClusterBlockIdOp, NVVM::BlockInClusterIdXOp, - NVVM::BlockInClusterIdYOp, NVVM::BlockInClusterIdZOp>, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - gpu::index_lowering::OpLowering, - GPULaneIdOpToNVVM, GPUShuffleOpLowering, GPUReturnOpLowering>(converter); + NVVM::ClusterIdYOp, NVVM::ClusterIdZOp>>( + converter, IndexKind::Other, IntrType::Id); + patterns.add>(converter, IndexKind::Other, IntrType::Dim); + patterns.add>( + converter, IndexKind::Other, IntrType::Id); + patterns.add>(converter, IndexKind::Other, IntrType::Dim); + patterns.add>( + converter, IndexKind::Block, IntrType::Id); + patterns.add>( + converter, IndexKind::Grid, IntrType::Dim); + patterns.add( + converter); patterns.add( converter, NVVM::kSharedMemoryAlignmentBit); diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index 855abc12a909e..bc830a77f3c58 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -14,6 +14,7 @@ #include "mlir/Dialect/LLVMIR/NVVMDialect.h" #include "mlir/Target/LLVMIR/ModuleImport.h" +#include "llvm/IR/ConstantRange.h" #include "llvm/IR/IntrinsicsNVPTX.h" using namespace mlir; diff --git a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir index 8f2ec289c9252..66ad1e307fc3a 100644 --- a/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir +++ b/mlir/test/Conversion/GPUToNVVM/gpu-to-nvvm.mlir @@ -50,7 +50,7 @@ gpu.module @test_module_0 { %gDimZ = gpu.grid_dim z - // CHECK: = nvvm.read.ptx.sreg.laneid : i32 + // CHECK: = nvvm.read.ptx.sreg.laneid range : i32 // CHECK: = llvm.sext %{{.*}} : i32 to i64 %laneId = gpu.lane_id @@ -699,9 +699,21 @@ gpu.module @test_module_32 { } gpu.module @test_module_33 { -// CHECK-LABEL: func @kernel_with_block_size() -// CHECK: attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel, nvvm.maxntid = array} - gpu.func @kernel_with_block_size() kernel attributes {known_block_size = array} { +// CHECK-LABEL: func @kernel_with_block_size( +// CHECK: attributes {gpu.kernel, gpu.known_block_size = array, nvvm.kernel, nvvm.maxntid = array} + gpu.func @kernel_with_block_size(%arg0: !llvm.ptr) kernel attributes {known_block_size = array} { + // CHECK: = nvvm.read.ptx.sreg.tid.x range : i32 + %0 = gpu.thread_id x + // CHECK: = nvvm.read.ptx.sreg.tid.y range : i32 + %1 = gpu.thread_id y + // CHECK: = nvvm.read.ptx.sreg.tid.z range : i32 + %2 = gpu.thread_id z + + // Fake usage to prevent dead code elimination + %3 = arith.addi %0, %1 : index + %4 = arith.addi %3, %2 : index + %5 = arith.index_cast %4 : index to i64 + llvm.store %5, %arg0 
: i64, !llvm.ptr gpu.return } } @@ -917,6 +929,20 @@ gpu.module @test_module_48 { } } +gpu.module @test_module_49 { +// CHECK-LABEL: func @explicit_id_bounds() + func.func @explicit_id_bounds() -> (index, index, index) { + // CHECK: = nvvm.read.ptx.sreg.tid.x range : i32 + %0 = gpu.thread_id x upper_bound 32 + // CHECK: = nvvm.read.ptx.sreg.ntid.x range : i32 + %1 = gpu.block_dim x upper_bound 32 + // CHECK: = nvvm.read.ptx.sreg.laneid range : i32 + %2 = gpu.lane_id upper_bound 16 + + return %0, %1, %2 : index, index, index + } +} + module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%toplevel_module: !transform.any_op {transform.readonly}) { %gpu_module = transform.structured.match ops{["gpu.module"]} in %toplevel_module diff --git a/mlir/test/Target/LLVMIR/Import/nvvmir.ll b/mlir/test/Target/LLVMIR/Import/nvvmir.ll index e4a8773e2dd80..131e9065b2d88 100644 --- a/mlir/test/Target/LLVMIR/Import/nvvmir.ll +++ b/mlir/test/Target/LLVMIR/Import/nvvmir.ll @@ -58,6 +58,9 @@ define i32 @nvvm_special_regs() { %27 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.ctarank() ; CHECK: = nvvm.read.ptx.sreg.cluster.nctarank : i32 %28 = call i32 @llvm.nvvm.read.ptx.sreg.cluster.nctarank() + + ; CHECK = nvvm.read.ptx.sreg.tid.x range <0 : i32, 64 : i32> : i32 + %29 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() ret i32 %1 } diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 88ffb1c7bfdf7..7fd082a5eb3c7 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -62,7 +62,10 @@ llvm.func @nvvm_special_regs() -> i32 { %29 = nvvm.read.ptx.sreg.clock : i32 // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64 %30 = nvvm.read.ptx.sreg.clock64 : i64 - + + // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %31 = nvvm.read.ptx.sreg.tid.x range : i32 + llvm.return %1 : i32 } From 02d8813820b1ebf3fae6993e677db269f0077272 Mon Sep 17 00:00:00 2001 From: jimingham Date: Fri, 13 Sep 2024 10:18:03 -0700 Subject: [PATCH 35/43] Add a comment in the SB API doc about keeping the SB API's lightweight. (#108462) --- lldb/docs/resources/sbapi.rst | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/lldb/docs/resources/sbapi.rst b/lldb/docs/resources/sbapi.rst index cf32cc6c81558..4ca3909e0f291 100644 --- a/lldb/docs/resources/sbapi.rst +++ b/lldb/docs/resources/sbapi.rst @@ -72,6 +72,17 @@ building the LLDB framework for macOS, the headers are processed with ``unifdef`` prior to being copied into the framework bundle to remove macros involving SWIG. +Another good principle when adding SB API methods is: if you find yourself +implementing a significant algorithm in the SB API method, you should not do +that, but instead look for and then add it - if not found - as a method in the +underlying lldb_private class, and then call that from your SB API method. +If it was a useful algorithm, it's very likely it already exists +because the lldb_private code also needed to do it. And if it doesn't at +present, if it was a useful thing to do, it's likely someone will later need +it in lldb_private and then we end up with two implementations of the same +algorithm. If we keep the SB API code to just what's needed to manage the SB +objects and requests, we won't get into this situation. + Lifetime -------- Many SB API methods will return strings in the form of ``const char *`` values. 
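As a hypothetical sketch of the guideline added above -- not actual LLDB code, with invented names (`Widget`, `SBWidget`, `CountMatches`) -- the SB method stays a thin shim while the algorithm lives on the lldb_private side:

```cpp
#include <memory>
#include <string>

// lldb_private side: the real algorithm lives here so internal code can
// reuse it instead of growing a second copy behind the SB API.
class Widget {
public:
  unsigned CountMatches(const std::string &name) const {
    return name.empty() ? 0 : 1; // placeholder for the "significant algorithm"
  }
};

// SB API side: validate arguments, hold the opaque pointer, and delegate.
class SBWidget {
public:
  unsigned CountMatches(const char *name) const {
    if (m_opaque_sp && name)
      return m_opaque_sp->CountMatches(name);
    return 0;
  }

private:
  std::shared_ptr<Widget> m_opaque_sp;
};
```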
From b6bf27ef3c179eefd805f39aa681705fc980ceed Mon Sep 17 00:00:00 2001 From: jeffreytan81 Date: Fri, 13 Sep 2024 10:26:01 -0700 Subject: [PATCH 36/43] Avoid expression evaluation in libStdC++ std::vector synthetic children provider (#108414) Our customers are reporting a serious performance issue (expanding a `this` pointer takes 70 seconds in VSCode) in a specific execution context. Profiling shows the hot path is triggered by an expression evaluation from the libStdC++ synthetic children provider for `std::vector<bool>` since it uses `CreateValueFromExpression()`. This PR adds a new `SBValue::CreateBoolValue()` API and switches the `std::vector<bool>` synthetic children provider to use the new API without performing expression evaluation. Note: there might be other cases of `CreateValueFromExpression()` in our summary/synthetic children providers which I will sweep through in later PRs. With this PR, the customer's scenario reduces from 70 seconds => 50 seconds. I will add other PRs to further optimize the remaining 50 seconds (mostly from type/namespace lookup). Testing: `test/API/functionalities/data-formatter/data-formatter-stl/libstdcpp/vbool/TestDataFormatterStdVBool.py` passes with the PR --------- Co-authored-by: jeffreytan81 --- lldb/examples/synthetic/gnu_libstdcpp.py | 6 +----- lldb/include/lldb/API/SBValue.h | 2 ++ lldb/source/API/SBValue.cpp | 16 ++++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/lldb/examples/synthetic/gnu_libstdcpp.py b/lldb/examples/synthetic/gnu_libstdcpp.py index d98495b8a9df3..a6605a7a7eb5b 100644 --- a/lldb/examples/synthetic/gnu_libstdcpp.py +++ b/lldb/examples/synthetic/gnu_libstdcpp.py @@ -473,11 +473,7 @@ def get_child_at_index(self, index): "[" + str(index) + "]", element_offset, element_type ) bit = element.GetValueAsUnsigned(0) & (1 << bit_offset) - if bit != 0: - value_expr = "(bool)true" - else: - value_expr = "(bool)false" - return self.valobj.CreateValueFromExpression("[%d]" % index, value_expr) + return self.valobj.CreateBoolValue("[%d]" % index, bool(bit)) def update(self): try: diff --git a/lldb/include/lldb/API/SBValue.h b/lldb/include/lldb/API/SBValue.h index bec816fb45184..9090cece80f7c 100644 --- a/lldb/include/lldb/API/SBValue.h +++ b/lldb/include/lldb/API/SBValue.h @@ -145,6 +145,8 @@ class LLDB_API SBValue { // AddressOf() on the return of this call all return invalid lldb::SBValue CreateValueFromData(const char *name, lldb::SBData data, lldb::SBType type); + // Returned value has no address. + lldb::SBValue CreateBoolValue(const char *name, bool value); /// Get a child value by index from a value.
/// diff --git a/lldb/source/API/SBValue.cpp b/lldb/source/API/SBValue.cpp index 273aac5ad4798..e1a31708d46ff 100644 --- a/lldb/source/API/SBValue.cpp +++ b/lldb/source/API/SBValue.cpp @@ -645,6 +645,22 @@ lldb::SBValue SBValue::CreateValueFromData(const char *name, SBData data, return sb_value; } +lldb::SBValue SBValue::CreateBoolValue(const char *name, bool value) { + LLDB_INSTRUMENT_VA(this, name); + + lldb::SBValue sb_value; + lldb::ValueObjectSP new_value_sp; + ValueLocker locker; + lldb::ValueObjectSP value_sp(GetSP(locker)); + lldb::TargetSP target_sp = m_opaque_sp->GetTargetSP(); + if (value_sp && target_sp) { + new_value_sp = + ValueObject::CreateValueObjectFromBool(target_sp, value, name); + } + sb_value.SetSP(new_value_sp); + return sb_value; +} + SBValue SBValue::GetChildAtIndex(uint32_t idx) { LLDB_INSTRUMENT_VA(this, idx); From 0351dc522a25df0473a63b414a5bfde5814d3dc3 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Fri, 13 Sep 2024 10:33:43 -0700 Subject: [PATCH 37/43] [lldb] Do not use LC_FUNCTION_STARTS data to determine symbol size as symbols are created (#106791) Summary: This improves the performance of ObjectFileMacho::ParseSymtab by removing eager and expensive work in favor of doing it later in a less-expensive fashion. Experiment: My goal was to understand LLDB's startup time. First, I produced a Debug build of LLDB (no dSYM) and a Release+NoAsserts build of LLDB. The Release build debugged the Debug build as it debugged a small C++ program. I found that ObjectFileMachO::ParseSymtab accounted for somewhere between 1.2 and 1.3 seconds consistently. After applying this change, I consistently measured a reduction of approximately 100ms, putting the time closer to 1.1s and 1.2s on average. Background: ObjectFileMachO::ParseSymtab will incrementally create symbols by parsing nlist entries from the symtab section of a MachO binary. As it does this, it eagerly tries to determine the size of symbols (e.g. how long a function is) using LC_FUNCTION_STARTS data (or eh_frame if LC_FUNCTION_STARTS is unavailable). Concretely, this is done by performing a binary search on the function starts array and calculating the distance to the next function or the end of the section (whichever is smaller). However, this work is unnecessary for 2 reasons: 1. If you have debug symbol entries (i.e. STABs), the size of a function is usually stored right after the function's entry. Performing this work right before parsing the next entry is unnecessary work. 2. Calculating symbol sizes for symbols of size 0 is already performed in `Symtab::InitAddressIndexes` after all the symbols are added to the Symtab. It also does this more efficiently by walking over a list of symbols sorted by address, so the work to calculate the size per symbol is constant instead of O(log n). 
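To make the two strategies concrete, here is a hedged sketch (the `Sym` type and function names are hypothetical, not LLDB's actual code) of the per-symbol binary search versus the single pass over an address-sorted table:

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

struct Sym {
  uint64_t addr = 0;
  uint64_t size = 0;
};

// Old approach, per parsed entry: binary-search the LC_FUNCTION_STARTS
// array for the next start -- O(log n) work repeated for every symbol.
uint64_t sizeViaBinarySearch(const std::vector<uint64_t> &funcStarts,
                             uint64_t addr, uint64_t sectionEnd) {
  auto next = std::upper_bound(funcStarts.begin(), funcStarts.end(), addr);
  return (next != funcStarts.end() ? *next : sectionEnd) - addr;
}

// New approach, once after parsing: walk the address-sorted table and take
// the distance to the neighboring entry, so each symbol costs O(1).
void computeSizesInOnePass(std::vector<Sym> &syms, uint64_t sectionEnd) {
  for (size_t i = 0; i + 1 < syms.size(); ++i)
    if (syms[i].size == 0)
      syms[i].size = syms[i + 1].addr - syms[i].addr;
  if (!syms.empty() && syms.back().size == 0)
    syms.back().size = sectionEnd - syms.back().addr;
}
```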
--- .../ObjectFile/Mach-O/ObjectFileMachO.cpp | 63 ------------------- 1 file changed, 63 deletions(-) diff --git a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp index 06da83e26a26a..c36748963db37 100644 --- a/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp +++ b/lldb/source/Plugins/ObjectFile/Mach-O/ObjectFileMachO.cpp @@ -3768,7 +3768,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { SymbolType type = eSymbolTypeInvalid; SectionSP symbol_section; - lldb::addr_t symbol_byte_size = 0; bool add_nlist = true; bool is_gsym = false; bool demangled_is_synthesized = false; @@ -4354,47 +4353,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (symbol_section) { const addr_t section_file_addr = symbol_section->GetFileAddress(); - if (symbol_byte_size == 0 && function_starts_count > 0) { - addr_t symbol_lookup_file_addr = nlist.n_value; - // Do an exact address match for non-ARM addresses, else get the - // closest since the symbol might be a thumb symbol which has an - // address with bit zero set. - FunctionStarts::Entry *func_start_entry = - function_starts.FindEntry(symbol_lookup_file_addr, !is_arm); - if (is_arm && func_start_entry) { - // Verify that the function start address is the symbol address - // (ARM) or the symbol address + 1 (thumb). - if (func_start_entry->addr != symbol_lookup_file_addr && - func_start_entry->addr != (symbol_lookup_file_addr + 1)) { - // Not the right entry, NULL it out... - func_start_entry = nullptr; - } - } - if (func_start_entry) { - func_start_entry->data = true; - - addr_t symbol_file_addr = func_start_entry->addr; - if (is_arm) - symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - - const FunctionStarts::Entry *next_func_start_entry = - function_starts.FindNextEntry(func_start_entry); - const addr_t section_end_file_addr = - section_file_addr + symbol_section->GetByteSize(); - if (next_func_start_entry) { - addr_t next_symbol_file_addr = next_func_start_entry->addr; - // Be sure the clear the Thumb address bit when we calculate the - // size from the current and next address - if (is_arm) - next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - symbol_byte_size = std::min( - next_symbol_file_addr - symbol_file_addr, - section_end_file_addr - symbol_file_addr); - } else { - symbol_byte_size = section_end_file_addr - symbol_file_addr; - } - } - } symbol_value -= section_file_addr; } @@ -4501,9 +4459,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { if (nlist.n_desc & N_WEAK_REF) sym[sym_idx].SetIsWeak(true); - if (symbol_byte_size > 0) - sym[sym_idx].SetByteSize(symbol_byte_size); - if (demangled_is_synthesized) sym[sym_idx].SetDemangledNameIsSynthesized(true); @@ -4622,23 +4577,7 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { Address symbol_addr; if (module_sp->ResolveFileAddress(symbol_file_addr, symbol_addr)) { SectionSP symbol_section(symbol_addr.GetSection()); - uint32_t symbol_byte_size = 0; if (symbol_section) { - const addr_t section_file_addr = symbol_section->GetFileAddress(); - const FunctionStarts::Entry *next_func_start_entry = - function_starts.FindNextEntry(func_start_entry); - const addr_t section_end_file_addr = - section_file_addr + symbol_section->GetByteSize(); - if (next_func_start_entry) { - addr_t next_symbol_file_addr = next_func_start_entry->addr; - if (is_arm) - next_symbol_file_addr &= THUMB_ADDRESS_BIT_MASK; - symbol_byte_size = std::min( - next_symbol_file_addr - symbol_file_addr, - section_end_file_addr - symbol_file_addr); - } 
else { - symbol_byte_size = section_end_file_addr - symbol_file_addr; - } sym[sym_idx].SetID(synthetic_sym_id++); // Don't set the name for any synthetic symbols, the Symbol // object will generate one if needed when the name is accessed @@ -4650,8 +4589,6 @@ void ObjectFileMachO::ParseSymtab(Symtab &symtab) { add_symbol_addr(symbol_addr.GetFileAddress()); if (symbol_flags) sym[sym_idx].SetFlags(symbol_flags); - if (symbol_byte_size) - sym[sym_idx].SetByteSize(symbol_byte_size); ++sym_idx; } } From 3a274584ebbcad6500efc4083bb53c1af565e294 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 13 Sep 2024 10:41:45 -0700 Subject: [PATCH 38/43] [LiveDebugValues] Avoid repeated hash lookups (NFC) (#108484) --- llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp index a69dbbbbdab3c..a73a3aa59403b 100644 --- a/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp +++ b/llvm/lib/CodeGen/LiveDebugValues/InstrRefBasedImpl.cpp @@ -2231,11 +2231,9 @@ void InstrRefBasedLDV::accumulateFragmentMap(MachineInstr &MI) { // If this is the first sighting of this variable, then we are guaranteed // there are currently no overlapping fragments either. Initialize the set // of seen fragments, record no overlaps for the current one, and return. - auto SeenIt = SeenFragments.find(MIVar.getVariable()); - if (SeenIt == SeenFragments.end()) { - SmallSet OneFragment; - OneFragment.insert(ThisFragment); - SeenFragments.insert({MIVar.getVariable(), OneFragment}); + auto [SeenIt, Inserted] = SeenFragments.try_emplace(MIVar.getVariable()); + if (Inserted) { + SeenIt->second.insert(ThisFragment); OverlapFragments.insert({{MIVar.getVariable(), ThisFragment}, {}}); return; From 758230827d59ab312515e7ad9e6d25b799dedd46 Mon Sep 17 00:00:00 2001 From: David Green Date: Fri, 13 Sep 2024 16:42:08 +0100 Subject: [PATCH 39/43] [AArch64][GISel] Scalarize i128 vector shifts. Like most other i128 operations, this adds scalarization for i128 vector shifts. Which in turn allows a few other operations to legalize too. 
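The csel/lsl/lsr sequences in the updated tests are the scalar form of exactly this. As a minimal, illustrative C++ sketch -- not the legalizer's actual code, and assuming a shift amount below 128 -- one scalarized i128 shift amounts to the classic double-word shift over two 64-bit halves:

```cpp
#include <cstdint>

struct U128 {
  uint64_t lo, hi;
};

// Double-word left shift over two 64-bit halves; assumes amt < 128 so no
// shift by 64 or more is ever executed on a uint64_t.
U128 shl128(U128 v, unsigned amt) {
  if (amt == 0)
    return v;
  U128 r{0, 0};
  if (amt >= 64) {
    r.hi = v.lo << (amt - 64); // low half moves entirely into the high half
  } else {
    r.lo = v.lo << amt;
    r.hi = (v.hi << amt) | (v.lo >> (64 - amt)); // carry bits across halves
  }
  return r;
}
```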
--- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 3 +- llvm/test/CodeGen/AArch64/abs.ll | 34 +++ llvm/test/CodeGen/AArch64/fcmp.ll | 279 +++++++++++++----- llvm/test/CodeGen/AArch64/shift.ll | 175 ++++++++++- 4 files changed, 419 insertions(+), 72 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index db5cd1d32d73d..623e59c4be805 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -179,7 +179,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) .moreElementsToNextPow2(0) - .minScalarSameAs(1, 0); + .minScalarSameAs(1, 0) + .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); getActionDefinitionsBuilder(G_PTR_ADD) .legalFor({{p0, s64}, {v2p0, v2s64}}) diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 6da019a79b727..25a14ef9a49ee 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -280,6 +280,40 @@ entry: } declare <4 x i64> @llvm.abs.v4i64(<4 x i64>, i1) +define <2 x i128> @abs_v4i128(<2 x i128> %a){ +; CHECK-SD-LABEL: abs_v4i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: asr x8, x1, #63 +; CHECK-SD-NEXT: asr x9, x3, #63 +; CHECK-SD-NEXT: eor x10, x0, x8 +; CHECK-SD-NEXT: eor x11, x1, x8 +; CHECK-SD-NEXT: subs x0, x10, x8 +; CHECK-SD-NEXT: eor x10, x2, x9 +; CHECK-SD-NEXT: sbc x1, x11, x8 +; CHECK-SD-NEXT: eor x8, x3, x9 +; CHECK-SD-NEXT: subs x2, x10, x9 +; CHECK-SD-NEXT: sbc x3, x8, x9 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: abs_v4i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: asr x8, x1, #63 +; CHECK-GI-NEXT: asr x9, x3, #63 +; CHECK-GI-NEXT: adds x10, x0, x8 +; CHECK-GI-NEXT: adc x11, x1, x8 +; CHECK-GI-NEXT: adds x12, x2, x9 +; CHECK-GI-NEXT: eor x0, x10, x8 +; CHECK-GI-NEXT: adc x13, x3, x9 +; CHECK-GI-NEXT: eor x1, x11, x8 +; CHECK-GI-NEXT: eor x2, x12, x9 +; CHECK-GI-NEXT: eor x3, x13, x9 +; CHECK-GI-NEXT: ret +entry: + %res = call <2 x i128> @llvm.abs.v2i128(<2 x i128> %a, i1 0) + ret <2 x i128> %res +} +declare <2 x i128> @llvm.abs.v2i128(<2 x i128>, i1) + ; ===== Vectors with Non-Pow 2 Widths ===== define <3 x i8> @abs_v3i8(<3 x i8> %a){ diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 8ca1e9ee5b617..5e44da5fcfa2d 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -1,11 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-NOFP16 ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD,CHECK-SD-FP16 -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 -; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 - -; CHECK-GI: warning: Instruction selection used fallback path for v2f128_fp128 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f128_fp128 +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-NOFP16 +; 
RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define fp128 @f128_fp128(fp128 %a, fp128 %b, fp128 %d, fp128 %e) { ; CHECK-SD-LABEL: f128_fp128: @@ -429,35 +426,90 @@ entry: } define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d, <2 x fp128> %e) { -; CHECK-LABEL: v2f128_fp128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: stp q4, q5, [sp] // 32-byte Folded Spill -; CHECK-NEXT: stp q1, q3, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v2.16b -; CHECK-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.ge .LBB12_2 -; CHECK-NEXT: // %bb.1: // %entry -; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload -; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .LBB12_2: // %entry -; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.ge .LBB12_4 -; CHECK-NEXT: // %bb.3: // %entry -; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: .LBB12_4: // %entry -; CHECK-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v2f128_fp128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #112 +; CHECK-SD-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q4, q5, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q1, q3, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v2.16b +; CHECK-SD-NEXT: stp q7, q6, [sp, #64] // 32-byte Folded Spill +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.ge .LBB12_2 +; CHECK-SD-NEXT: // %bb.1: // %entry +; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload +; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB12_2: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: ldr q1, [sp, #64] // 16-byte Folded Reload +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.ge .LBB12_4 +; CHECK-SD-NEXT: // %bb.3: // %entry +; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload +; CHECK-SD-NEXT: .LBB12_4: // %entry +; CHECK-SD-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload +; CHECK-SD-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #112 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2f128_fp128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #112 +; CHECK-GI-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 112 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w30, -16 +; CHECK-GI-NEXT: stp q3, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v2.16b +; CHECK-GI-NEXT: stp q4, q5, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: cset w19, lt +; CHECK-GI-NEXT: bl __lttf2 +; 
CHECK-GI-NEXT: ldr q0, [sp, #32] // 16-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: bfi x19, x8, #32, #32 +; CHECK-GI-NEXT: cset w8, lt +; CHECK-GI-NEXT: fmov x10, d0 +; CHECK-GI-NEXT: mov x11, v0.d[1] +; CHECK-GI-NEXT: bfi x8, x8, #32, #32 +; CHECK-GI-NEXT: ldp q0, q1, [sp, #48] // 32-byte Folded Reload +; CHECK-GI-NEXT: lsl x9, x19, #63 +; CHECK-GI-NEXT: lsl x8, x8, #63 +; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: asr x9, x9, #63 +; CHECK-GI-NEXT: fmov x12, d0 +; CHECK-GI-NEXT: mov x13, v0.d[1] +; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x14, d1 +; CHECK-GI-NEXT: asr x8, x8, #63 +; CHECK-GI-NEXT: and x10, x10, x9 +; CHECK-GI-NEXT: fmov x15, d0 +; CHECK-GI-NEXT: mov x16, v1.d[1] +; CHECK-GI-NEXT: mov x17, v0.d[1] +; CHECK-GI-NEXT: and x12, x12, x8 +; CHECK-GI-NEXT: bic x14, x14, x9 +; CHECK-GI-NEXT: bic x15, x15, x8 +; CHECK-GI-NEXT: orr x10, x10, x14 +; CHECK-GI-NEXT: orr x12, x12, x15 +; CHECK-GI-NEXT: mov v0.d[0], x10 +; CHECK-GI-NEXT: and x10, x11, x9 +; CHECK-GI-NEXT: mov v1.d[0], x12 +; CHECK-GI-NEXT: and x11, x13, x8 +; CHECK-GI-NEXT: bic x9, x16, x9 +; CHECK-GI-NEXT: bic x8, x17, x8 +; CHECK-GI-NEXT: orr x9, x10, x9 +; CHECK-GI-NEXT: orr x8, x11, x8 +; CHECK-GI-NEXT: mov v0.d[1], x9 +; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: add sp, sp, #112 +; CHECK-GI-NEXT: ret entry: %c = fcmp olt <2 x fp128> %a, %b %s = select <2 x i1> %c, <2 x fp128> %d, <2 x fp128> %e @@ -465,42 +517,129 @@ entry: } define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d, <3 x fp128> %e) { -; CHECK-LABEL: v3f128_fp128: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: sub sp, sp, #112 -; CHECK-NEXT: str x30, [sp, #96] // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 112 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: stp q1, q4, [sp] // 32-byte Folded Spill -; CHECK-NEXT: mov v1.16b, v3.16b -; CHECK-NEXT: stp q2, q5, [sp, #32] // 32-byte Folded Spill -; CHECK-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.lt .LBB13_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: ldr q0, [sp, #128] -; CHECK-NEXT: str q0, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: .LBB13_2: // %entry -; CHECK-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: b.lt .LBB13_4 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: ldr q0, [sp, #144] -; CHECK-NEXT: str q0, [sp, #80] // 16-byte Folded Spill -; CHECK-NEXT: .LBB13_4: // %entry -; CHECK-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload -; CHECK-NEXT: bl __lttf2 -; CHECK-NEXT: add x8, sp, #160 -; CHECK-NEXT: cmp w0, #0 -; CHECK-NEXT: add x9, sp, #112 -; CHECK-NEXT: csel x8, x9, x8, lt -; CHECK-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload -; CHECK-NEXT: ldr q2, [x8] -; CHECK-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload -; CHECK-NEXT: add sp, sp, #112 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3f128_fp128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: sub sp, sp, #112 +; CHECK-SD-NEXT: str x30, [sp, #96] // 8-byte Folded Spill +; CHECK-SD-NEXT: .cfi_def_cfa_offset 112 +; CHECK-SD-NEXT: .cfi_offset w30, -16 +; CHECK-SD-NEXT: stp q1, q4, [sp] // 32-byte Folded Spill +; CHECK-SD-NEXT: mov v1.16b, v3.16b +; CHECK-SD-NEXT: stp q2, q5, [sp, #32] // 32-byte Folded Spill +; CHECK-SD-NEXT: stp q6, q7, [sp, #64] // 32-byte Folded Spill +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.lt .LBB13_2 
+; CHECK-SD-NEXT: // %bb.1: +; CHECK-SD-NEXT: ldr q0, [sp, #128] +; CHECK-SD-NEXT: str q0, [sp, #64] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB13_2: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: b.lt .LBB13_4 +; CHECK-SD-NEXT: // %bb.3: +; CHECK-SD-NEXT: ldr q0, [sp, #144] +; CHECK-SD-NEXT: str q0, [sp, #80] // 16-byte Folded Spill +; CHECK-SD-NEXT: .LBB13_4: // %entry +; CHECK-SD-NEXT: ldp q0, q1, [sp, #32] // 32-byte Folded Reload +; CHECK-SD-NEXT: bl __lttf2 +; CHECK-SD-NEXT: add x8, sp, #160 +; CHECK-SD-NEXT: cmp w0, #0 +; CHECK-SD-NEXT: add x9, sp, #112 +; CHECK-SD-NEXT: csel x8, x9, x8, lt +; CHECK-SD-NEXT: ldp q0, q1, [sp, #64] // 32-byte Folded Reload +; CHECK-SD-NEXT: ldr q2, [x8] +; CHECK-SD-NEXT: ldr x30, [sp, #96] // 8-byte Folded Reload +; CHECK-SD-NEXT: add sp, sp, #112 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3f128_fp128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: sub sp, sp, #192 +; CHECK-GI-NEXT: str x30, [sp, #160] // 8-byte Folded Spill +; CHECK-GI-NEXT: stp x20, x19, [sp, #176] // 16-byte Folded Spill +; CHECK-GI-NEXT: .cfi_def_cfa_offset 192 +; CHECK-GI-NEXT: .cfi_offset w19, -8 +; CHECK-GI-NEXT: .cfi_offset w20, -16 +; CHECK-GI-NEXT: .cfi_offset w30, -32 +; CHECK-GI-NEXT: stp q4, q1, [sp] // 32-byte Folded Spill +; CHECK-GI-NEXT: mov v1.16b, v3.16b +; CHECK-GI-NEXT: stp q5, q2, [sp, #32] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #192] +; CHECK-GI-NEXT: str q7, [sp, #64] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q6, q2, [sp, #80] // 32-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #208] +; CHECK-GI-NEXT: str q2, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #224] +; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: ldr q2, [sp, #240] +; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: cset w19, lt +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: cset w20, lt +; CHECK-GI-NEXT: bl __lttf2 +; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload +; CHECK-GI-NEXT: bfi x19, x8, #32, #32 +; CHECK-GI-NEXT: bfi x20, x8, #32, #32 +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: mov x10, v0.d[1] +; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload +; CHECK-GI-NEXT: cset w9, lt +; CHECK-GI-NEXT: lsl x13, x19, #63 +; CHECK-GI-NEXT: lsl x14, x20, #63 +; CHECK-GI-NEXT: fmov x11, d0 +; CHECK-GI-NEXT: mov x12, v0.d[1] +; CHECK-GI-NEXT: ldr q0, [sp, #96] // 16-byte Folded Reload +; CHECK-GI-NEXT: bfi x9, x8, #32, #32 +; CHECK-GI-NEXT: asr x13, x13, #63 +; CHECK-GI-NEXT: asr x14, x14, #63 +; CHECK-GI-NEXT: fmov x15, d0 +; CHECK-GI-NEXT: mov x16, v0.d[1] +; CHECK-GI-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload +; CHECK-GI-NEXT: lsl x9, x9, #63 +; CHECK-GI-NEXT: and x8, x8, x13 +; CHECK-GI-NEXT: and x11, x11, x14 +; CHECK-GI-NEXT: asr x9, x9, #63 +; CHECK-GI-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x17, d0 +; CHECK-GI-NEXT: mov x18, v0.d[1] +; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload +; CHECK-GI-NEXT: fmov x0, d1 +; CHECK-GI-NEXT: and x15, x15, x9 +; CHECK-GI-NEXT: mov x2, v1.d[1] +; CHECK-GI-NEXT: fmov x1, d0 +; 
CHECK-GI-NEXT: mov x3, v0.d[1] +; CHECK-GI-NEXT: bic x17, x17, x13 +; CHECK-GI-NEXT: bic x0, x0, x14 +; CHECK-GI-NEXT: orr x8, x8, x17 +; CHECK-GI-NEXT: bic x1, x1, x9 +; CHECK-GI-NEXT: orr x11, x11, x0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: orr x15, x15, x1 +; CHECK-GI-NEXT: mov v1.d[0], x11 +; CHECK-GI-NEXT: and x8, x10, x13 +; CHECK-GI-NEXT: mov v2.d[0], x15 +; CHECK-GI-NEXT: and x10, x12, x14 +; CHECK-GI-NEXT: and x11, x16, x9 +; CHECK-GI-NEXT: bic x12, x18, x13 +; CHECK-GI-NEXT: bic x13, x2, x14 +; CHECK-GI-NEXT: bic x9, x3, x9 +; CHECK-GI-NEXT: orr x8, x8, x12 +; CHECK-GI-NEXT: orr x10, x10, x13 +; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x10 +; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: add sp, sp, #192 +; CHECK-GI-NEXT: ret entry: %c = fcmp olt <3 x fp128> %a, %b %s = select <3 x i1> %c, <3 x fp128> %d, <3 x fp128> %e diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 951458da17c07..7014a4a9acbe0 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i1 @shl_i1(i1 %0, i1 %1){ @@ -674,6 +674,61 @@ define <4 x i64> @shl_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @shl_v2i128(<2 x i128> %0, <2 x i128> %1){ +; CHECK-SD-LABEL: shl_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsl x10, x1, x4 +; CHECK-SD-NEXT: mvn w12, w6 +; CHECK-SD-NEXT: lsl x11, x0, x4 +; CHECK-SD-NEXT: lsl x13, x3, x6 +; CHECK-SD-NEXT: lsr x8, x8, x9 +; CHECK-SD-NEXT: lsr x9, x2, #1 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: csel x0, xzr, x11, ne +; CHECK-SD-NEXT: lsr x9, x9, x12 +; CHECK-SD-NEXT: orr x8, x10, x8 +; CHECK-SD-NEXT: lsl x10, x2, x6 +; CHECK-SD-NEXT: csel x1, x11, x8, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: orr x8, x13, x9 +; CHECK-SD-NEXT: csel x2, xzr, x10, ne +; CHECK-SD-NEXT: csel x3, x10, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsl x11, x1, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: lsl x10, x0, x10 +; CHECK-GI-NEXT: lsl x12, x0, x4 +; CHECK-GI-NEXT: lsr x9, x0, x9 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: lsr x8, x2, x8 +; CHECK-GI-NEXT: csel x0, x12, xzr, lo +; CHECK-GI-NEXT: lsl x12, x2, x6 +; CHECK-GI-NEXT: orr x9, x9, x11 +; CHECK-GI-NEXT: lsl x11, x3, x6 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: sub x10, x6, #64 +; CHECK-GI-NEXT: cmp x4, #0 +; CHECK-GI-NEXT: lsl x10, x2, x10 +; CHECK-GI-NEXT: csel x1, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x8, x11 +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: csel x2, x12, xzr, lo +; CHECK-GI-NEXT: csel x8, x8, x10, lo +; CHECK-GI-NEXT: cmp x6, #0 +; CHECK-GI-NEXT: csel x3, x3, x8, eq +; CHECK-GI-NEXT: ret + %3 = shl <2 x i128> %0, %1 + ret <2 x i128> %3 +} + define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-SD-LABEL: ashr_v4i8: ; CHECK-SD: 
// %bb.0: @@ -819,6 +874,67 @@ define <4 x i64> @ashr_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @ashr_v2i128(<2 x i128> %0, <2 x i128> %1){ +; CHECK-SD-LABEL: ashr_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsl x10, x3, #1 +; CHECK-SD-NEXT: lsr x11, x0, x4 +; CHECK-SD-NEXT: lsr x12, x2, x6 +; CHECK-SD-NEXT: asr x13, x1, #63 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: mvn w9, w6 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: lsl x9, x10, x9 +; CHECK-SD-NEXT: asr x10, x1, x4 +; CHECK-SD-NEXT: asr x14, x3, #63 +; CHECK-SD-NEXT: orr x8, x8, x11 +; CHECK-SD-NEXT: asr x11, x3, x6 +; CHECK-SD-NEXT: csel x0, x10, x8, ne +; CHECK-SD-NEXT: orr x8, x9, x12 +; CHECK-SD-NEXT: csel x1, x13, x10, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: csel x2, x11, x8, ne +; CHECK-SD-NEXT: csel x3, x14, x11, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsr x11, x0, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: asr x10, x1, x10 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: lsl x9, x1, x9 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: asr x12, x1, x4 +; CHECK-GI-NEXT: lsl x8, x3, x8 +; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: asr x11, x1, #63 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: cmp x4, #0 +; CHECK-GI-NEXT: lsr x10, x2, x6 +; CHECK-GI-NEXT: csel x0, x0, x9, eq +; CHECK-GI-NEXT: sub x9, x6, #64 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: asr x9, x3, x9 +; CHECK-GI-NEXT: csel x1, x12, x11, lo +; CHECK-GI-NEXT: orr x8, x10, x8 +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: asr x11, x3, x6 +; CHECK-GI-NEXT: asr x10, x3, #63 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x6, #0 +; CHECK-GI-NEXT: csel x2, x2, x8, eq +; CHECK-GI-NEXT: cmp x6, #64 +; CHECK-GI-NEXT: csel x3, x11, x10, lo +; CHECK-GI-NEXT: ret + %3 = ashr <2 x i128> %0, %1 + ret <2 x i128> %3 +} + define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ ; CHECK-SD-LABEL: lshr_v4i8: ; CHECK-SD: // %bb.0: @@ -962,6 +1078,63 @@ define <4 x i64> @lshr_v4i64(<4 x i64> %0, <4 x i64> %1){ ret <4 x i64> %3 } +define <2 x i128> @lshr_v2i128(<2 x i128> %0, <2 x i128> %1){ +; CHECK-SD-LABEL: lshr_v2i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w4 +; CHECK-SD-NEXT: lsr x10, x0, x4 +; CHECK-SD-NEXT: mvn w12, w6 +; CHECK-SD-NEXT: lsr x11, x1, x4 +; CHECK-SD-NEXT: lsr x13, x2, x6 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: lsl x9, x3, #1 +; CHECK-SD-NEXT: tst x4, #0x40 +; CHECK-SD-NEXT: csel x1, xzr, x11, ne +; CHECK-SD-NEXT: lsl x9, x9, x12 +; CHECK-SD-NEXT: orr x8, x8, x10 +; CHECK-SD-NEXT: lsr x10, x3, x6 +; CHECK-SD-NEXT: csel x0, x11, x8, ne +; CHECK-SD-NEXT: tst x6, #0x40 +; CHECK-SD-NEXT: orr x8, x9, x13 +; CHECK-SD-NEXT: csel x3, xzr, x10, ne +; CHECK-SD-NEXT: csel x2, x10, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_v2i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x10, x4, #64 +; CHECK-GI-NEXT: lsr x11, x0, x4 +; CHECK-GI-NEXT: sub x9, x8, x4 +; CHECK-GI-NEXT: lsr x10, x1, x10 +; CHECK-GI-NEXT: cmp x4, #64 +; CHECK-GI-NEXT: lsl x9, x1, x9 +; CHECK-GI-NEXT: sub x8, x8, x6 +; CHECK-GI-NEXT: lsr x12, x1, x4 +; CHECK-GI-NEXT: lsl x8, x3, x8 +; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: lsr x11, x2, x6 +; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: cmp 
x4, #0
+; CHECK-GI-NEXT:    sub x10, x6, #64
+; CHECK-GI-NEXT:    csel x0, x0, x9, eq
+; CHECK-GI-NEXT:    cmp x4, #64
+; CHECK-GI-NEXT:    lsr x9, x3, x10
+; CHECK-GI-NEXT:    csel x1, x12, xzr, lo
+; CHECK-GI-NEXT:    orr x8, x11, x8
+; CHECK-GI-NEXT:    cmp x6, #64
+; CHECK-GI-NEXT:    lsr x10, x3, x6
+; CHECK-GI-NEXT:    csel x8, x8, x9, lo
+; CHECK-GI-NEXT:    cmp x6, #0
+; CHECK-GI-NEXT:    csel x2, x2, x8, eq
+; CHECK-GI-NEXT:    cmp x6, #64
+; CHECK-GI-NEXT:    csel x3, x10, xzr, lo
+; CHECK-GI-NEXT:    ret
+  %3 = lshr <2 x i128> %0, %1
+  ret <2 x i128> %3
+}
+
 ; ===== Vector with Non-Pow 2 Width =====

 define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){

From 3a4b30e11eb8a2015aac185cd2368f4dc3ed1e53 Mon Sep 17 00:00:00 2001
From: David Green
Date: Fri, 13 Sep 2024 18:44:26 +0100
Subject: [PATCH 40/43] [AArch64][GISel] Scalarize i128 ICmp and Select.

Similar to other i128 bit operations, we scalarize any icmps or selects
wider than 64 bits.

---
 .../AArch64/GISel/AArch64LegalizerInfo.cpp |   2 +
 llvm/test/CodeGen/AArch64/fcmp.ll          | 155 +++++++-----------
 llvm/test/CodeGen/AArch64/icmp.ll          |  58 ++++++-
 3 files changed, 116 insertions(+), 99 deletions(-)

diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 623e59c4be805..3957d21ea695b 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -543,6 +543,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .widenScalarOrEltToNextPow2(1)
       .clampScalar(1, s32, s64)
       .clampScalar(0, s32, s32)
+      .scalarizeIf(scalarOrEltWiderThan(1, 64), 1)
       .minScalarEltSameAsIf(
           [=](const LegalityQuery &Query) {
             const LLT &Ty = Query.Types[0];
@@ -785,6 +786,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
       .widenScalarToNextPow2(0)
       .clampScalar(0, s32, s64)
       .clampScalar(1, s32, s32)
+      .scalarizeIf(scalarOrEltWiderThan(0, 64), 0)
       .minScalarEltSameAsIf(all(isVector(0), isVector(1)), 1, 0)
       .lowerIf(isVector(0));

diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll
index 5e44da5fcfa2d..baab53d8bdbd4 100644
--- a/llvm/test/CodeGen/AArch64/fcmp.ll
+++ b/llvm/test/CodeGen/AArch64/fcmp.ll
@@ -465,49 +465,33 @@ define <2 x fp128> @v2f128_fp128(<2 x fp128> %a, <2 x fp128> %b, <2 x fp128> %d,
 ; CHECK-GI-NEXT:    .cfi_offset w30, -16
 ; CHECK-GI-NEXT:    stp q3, q1, [sp] // 32-byte Folded Spill
 ; CHECK-GI-NEXT:    mov v1.16b, v2.16b
-; CHECK-GI-NEXT:    stp q4, q5, [sp, #32] // 32-byte Folded Spill
-; CHECK-GI-NEXT:    stp q6, q7, [sp, #64] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    stp q6, q4, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT:    stp q7, q5, [sp, #64] // 32-byte Folded Spill
 ; CHECK-GI-NEXT:    bl __lttf2
 ; CHECK-GI-NEXT:    ldp q1, q0, [sp] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    cset w19, lt
+; CHECK-GI-NEXT:    mov w19, w0
 ; CHECK-GI-NEXT:    bl __lttf2
-; CHECK-GI-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    cmp w0, #0
-; CHECK-GI-NEXT:    bfi x19, x8, #32, #32
-; CHECK-GI-NEXT:    cset w8, lt
-; CHECK-GI-NEXT:    fmov x10, d0
-; CHECK-GI-NEXT:    mov x11, v0.d[1]
-; CHECK-GI-NEXT:    bfi x8, x8, #32, #32
-; CHECK-GI-NEXT:    ldp q0, q1, [sp, #48] // 32-byte Folded Reload
-; CHECK-GI-NEXT:    lsl x9, x19, #63
-; CHECK-GI-NEXT:    lsl x8, x8, #63
+; CHECK-GI-NEXT:    ldp q3, q2, [sp, #32] // 32-byte Folded Reload
+; CHECK-GI-NEXT:    cmp w19, #0
 ; CHECK-GI-NEXT:    ldp x30, x19, [sp, #96] // 16-byte Folded Reload
-; CHECK-GI-NEXT:    asr x9, x9, #63
-; CHECK-GI-NEXT:    fmov x12, d0
-; CHECK-GI-NEXT: mov x13, v0.d[1] -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x14, d1 -; CHECK-GI-NEXT: asr x8, x8, #63 -; CHECK-GI-NEXT: and x10, x10, x9 -; CHECK-GI-NEXT: fmov x15, d0 -; CHECK-GI-NEXT: mov x16, v1.d[1] -; CHECK-GI-NEXT: mov x17, v0.d[1] -; CHECK-GI-NEXT: and x12, x12, x8 -; CHECK-GI-NEXT: bic x14, x14, x9 -; CHECK-GI-NEXT: bic x15, x15, x8 -; CHECK-GI-NEXT: orr x10, x10, x14 -; CHECK-GI-NEXT: orr x12, x12, x15 -; CHECK-GI-NEXT: mov v0.d[0], x10 -; CHECK-GI-NEXT: and x10, x11, x9 -; CHECK-GI-NEXT: mov v1.d[0], x12 -; CHECK-GI-NEXT: and x11, x13, x8 -; CHECK-GI-NEXT: bic x9, x16, x9 -; CHECK-GI-NEXT: bic x8, x17, x8 -; CHECK-GI-NEXT: orr x9, x10, x9 -; CHECK-GI-NEXT: orr x8, x11, x8 -; CHECK-GI-NEXT: mov v0.d[1], x9 -; CHECK-GI-NEXT: mov v1.d[1], x8 +; CHECK-GI-NEXT: mov d0, v2.d[1] +; CHECK-GI-NEXT: mov d1, v3.d[1] +; CHECK-GI-NEXT: fcsel d2, d2, d3, lt +; CHECK-GI-NEXT: fmov x8, d2 +; CHECK-GI-NEXT: fcsel d3, d0, d1, lt +; CHECK-GI-NEXT: ldp q5, q0, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: mov d1, v0.d[1] +; CHECK-GI-NEXT: mov d4, v5.d[1] +; CHECK-GI-NEXT: fcsel d0, d0, d5, lt +; CHECK-GI-NEXT: fmov x9, d0 +; CHECK-GI-NEXT: mov v0.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d3 +; CHECK-GI-NEXT: fcsel d2, d1, d4, lt +; CHECK-GI-NEXT: mov v1.d[0], x9 +; CHECK-GI-NEXT: fmov x9, d2 +; CHECK-GI-NEXT: mov v0.d[1], x8 +; CHECK-GI-NEXT: mov v1.d[1], x9 ; CHECK-GI-NEXT: add sp, sp, #112 ; CHECK-GI-NEXT: ret entry: @@ -567,77 +551,52 @@ define <3 x fp128> @v3f128_fp128(<3 x fp128> %a, <3 x fp128> %b, <3 x fp128> %d, ; CHECK-GI-NEXT: mov v1.16b, v3.16b ; CHECK-GI-NEXT: stp q5, q2, [sp, #32] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [sp, #192] -; CHECK-GI-NEXT: str q7, [sp, #64] // 16-byte Folded Spill -; CHECK-GI-NEXT: stp q6, q2, [sp, #80] // 32-byte Folded Spill +; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [sp, #208] -; CHECK-GI-NEXT: str q2, [sp, #112] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q2, q6, [sp, #64] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [sp, #224] -; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill +; CHECK-GI-NEXT: stp q7, q2, [sp, #96] // 32-byte Folded Spill ; CHECK-GI-NEXT: ldr q2, [sp, #240] -; CHECK-GI-NEXT: str q2, [sp, #144] // 16-byte Folded Spill +; CHECK-GI-NEXT: str q2, [sp, #128] // 16-byte Folded Spill ; CHECK-GI-NEXT: bl __lttf2 ; CHECK-GI-NEXT: ldp q1, q0, [sp] // 32-byte Folded Reload -; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: cset w19, lt +; CHECK-GI-NEXT: mov w19, w0 ; CHECK-GI-NEXT: bl __lttf2 ; CHECK-GI-NEXT: ldp q1, q0, [sp, #32] // 32-byte Folded Reload -; CHECK-GI-NEXT: cmp w0, #0 -; CHECK-GI-NEXT: cset w20, lt +; CHECK-GI-NEXT: mov w20, w0 ; CHECK-GI-NEXT: bl __lttf2 -; CHECK-GI-NEXT: ldr q0, [sp, #80] // 16-byte Folded Reload -; CHECK-GI-NEXT: bfi x19, x8, #32, #32 -; CHECK-GI-NEXT: bfi x20, x8, #32, #32 -; CHECK-GI-NEXT: cmp w0, #0 +; CHECK-GI-NEXT: ldp q5, q4, [sp, #64] // 32-byte Folded Reload +; CHECK-GI-NEXT: cmp w19, #0 +; CHECK-GI-NEXT: ldp q7, q6, [sp, #96] // 32-byte Folded Reload ; CHECK-GI-NEXT: ldr x30, [sp, #160] // 8-byte Folded Reload -; CHECK-GI-NEXT: fmov x8, d0 -; CHECK-GI-NEXT: mov x10, v0.d[1] -; CHECK-GI-NEXT: ldr q0, [sp, #64] // 16-byte Folded Reload -; CHECK-GI-NEXT: cset w9, lt -; CHECK-GI-NEXT: lsl x13, x19, #63 -; CHECK-GI-NEXT: lsl x14, x20, #63 -; CHECK-GI-NEXT: fmov x11, d0 -; CHECK-GI-NEXT: mov x12, v0.d[1] -; CHECK-GI-NEXT: ldr q0, [sp, #96] // 16-byte Folded 
Reload -; CHECK-GI-NEXT: bfi x9, x8, #32, #32 -; CHECK-GI-NEXT: asr x13, x13, #63 -; CHECK-GI-NEXT: asr x14, x14, #63 -; CHECK-GI-NEXT: fmov x15, d0 -; CHECK-GI-NEXT: mov x16, v0.d[1] -; CHECK-GI-NEXT: ldp q0, q1, [sp, #112] // 32-byte Folded Reload -; CHECK-GI-NEXT: lsl x9, x9, #63 -; CHECK-GI-NEXT: and x8, x8, x13 -; CHECK-GI-NEXT: and x11, x11, x14 -; CHECK-GI-NEXT: asr x9, x9, #63 +; CHECK-GI-NEXT: mov d0, v4.d[1] +; CHECK-GI-NEXT: mov d1, v5.d[1] +; CHECK-GI-NEXT: fcsel d4, d4, d5, lt +; CHECK-GI-NEXT: mov d2, v7.d[1] +; CHECK-GI-NEXT: mov d3, v6.d[1] +; CHECK-GI-NEXT: fmov x8, d4 +; CHECK-GI-NEXT: fcsel d5, d0, d1, lt +; CHECK-GI-NEXT: cmp w20, #0 +; CHECK-GI-NEXT: fcsel d1, d7, d6, lt +; CHECK-GI-NEXT: ldp q7, q0, [sp, #128] // 32-byte Folded Reload +; CHECK-GI-NEXT: fcsel d3, d2, d3, lt +; CHECK-GI-NEXT: cmp w0, #0 ; CHECK-GI-NEXT: ldp x20, x19, [sp, #176] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x17, d0 -; CHECK-GI-NEXT: mov x18, v0.d[1] -; CHECK-GI-NEXT: ldr q0, [sp, #144] // 16-byte Folded Reload -; CHECK-GI-NEXT: fmov x0, d1 -; CHECK-GI-NEXT: and x15, x15, x9 -; CHECK-GI-NEXT: mov x2, v1.d[1] -; CHECK-GI-NEXT: fmov x1, d0 -; CHECK-GI-NEXT: mov x3, v0.d[1] -; CHECK-GI-NEXT: bic x17, x17, x13 -; CHECK-GI-NEXT: bic x0, x0, x14 -; CHECK-GI-NEXT: orr x8, x8, x17 -; CHECK-GI-NEXT: bic x1, x1, x9 -; CHECK-GI-NEXT: orr x11, x11, x0 +; CHECK-GI-NEXT: mov d2, v0.d[1] +; CHECK-GI-NEXT: mov d6, v7.d[1] +; CHECK-GI-NEXT: fcsel d7, d0, d7, lt ; CHECK-GI-NEXT: mov v0.d[0], x8 -; CHECK-GI-NEXT: orr x15, x15, x1 -; CHECK-GI-NEXT: mov v1.d[0], x11 -; CHECK-GI-NEXT: and x8, x10, x13 -; CHECK-GI-NEXT: mov v2.d[0], x15 -; CHECK-GI-NEXT: and x10, x12, x14 -; CHECK-GI-NEXT: and x11, x16, x9 -; CHECK-GI-NEXT: bic x12, x18, x13 -; CHECK-GI-NEXT: bic x13, x2, x14 -; CHECK-GI-NEXT: bic x9, x3, x9 -; CHECK-GI-NEXT: orr x8, x8, x12 -; CHECK-GI-NEXT: orr x10, x10, x13 -; CHECK-GI-NEXT: orr x9, x11, x9 +; CHECK-GI-NEXT: fmov x8, d1 +; CHECK-GI-NEXT: fmov x9, d7 +; CHECK-GI-NEXT: fcsel d4, d2, d6, lt +; CHECK-GI-NEXT: mov v1.d[0], x8 +; CHECK-GI-NEXT: fmov x8, d5 +; CHECK-GI-NEXT: mov v2.d[0], x9 +; CHECK-GI-NEXT: fmov x9, d3 +; CHECK-GI-NEXT: fmov x10, d4 ; CHECK-GI-NEXT: mov v0.d[1], x8 -; CHECK-GI-NEXT: mov v1.d[1], x10 -; CHECK-GI-NEXT: mov v2.d[1], x9 +; CHECK-GI-NEXT: mov v1.d[1], x9 +; CHECK-GI-NEXT: mov v2.d[1], x10 ; CHECK-GI-NEXT: add sp, sp, #192 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index b00e5d6c701d8..61964060ca2c8 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD -; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI +; RUN: llc -mtriple=aarch64 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) { ; CHECK-LABEL: i64_i64: @@ -1376,6 +1376,62 @@ entry: ret <32 x i8> %s } +define <2 x i128> @v2i128_i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %d, <2 x i128> %e) { +; CHECK-SD-LABEL: v2i128_i128: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: add x10, sp, #32 +; CHECK-SD-NEXT: mov x11, sp +; CHECK-SD-NEXT: cmp x0, x4 +; CHECK-SD-NEXT: orr x12, x10, #0x8 +; CHECK-SD-NEXT: orr x13, x11, 
#0x8 +; CHECK-SD-NEXT: sbcs xzr, x1, x5 +; CHECK-SD-NEXT: add x8, sp, #48 +; CHECK-SD-NEXT: add x9, sp, #16 +; CHECK-SD-NEXT: csel x12, x13, x12, lt +; CHECK-SD-NEXT: csel x10, x11, x10, lt +; CHECK-SD-NEXT: cmp x2, x6 +; CHECK-SD-NEXT: orr x11, x8, #0x8 +; CHECK-SD-NEXT: orr x13, x9, #0x8 +; CHECK-SD-NEXT: sbcs xzr, x3, x7 +; CHECK-SD-NEXT: ldr x0, [x10] +; CHECK-SD-NEXT: csel x8, x9, x8, lt +; CHECK-SD-NEXT: csel x9, x13, x11, lt +; CHECK-SD-NEXT: ldr x1, [x12] +; CHECK-SD-NEXT: ldr x2, [x8] +; CHECK-SD-NEXT: ldr x3, [x9] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v2i128_i128: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: cmp x1, x5 +; CHECK-GI-NEXT: ldp x8, x9, [sp] +; CHECK-GI-NEXT: cset w10, lt +; CHECK-GI-NEXT: cmp x0, x4 +; CHECK-GI-NEXT: cset w13, lo +; CHECK-GI-NEXT: cmp x1, x5 +; CHECK-GI-NEXT: csel w10, w13, w10, eq +; CHECK-GI-NEXT: cmp x3, x7 +; CHECK-GI-NEXT: ldp x13, x14, [sp, #32] +; CHECK-GI-NEXT: cset w15, lt +; CHECK-GI-NEXT: cmp x2, x6 +; CHECK-GI-NEXT: ldp x11, x12, [sp, #16] +; CHECK-GI-NEXT: cset w16, lo +; CHECK-GI-NEXT: cmp x3, x7 +; CHECK-GI-NEXT: ldp x17, x18, [sp, #48] +; CHECK-GI-NEXT: csel w15, w16, w15, eq +; CHECK-GI-NEXT: tst w10, #0x1 +; CHECK-GI-NEXT: csel x0, x8, x13, ne +; CHECK-GI-NEXT: csel x1, x9, x14, ne +; CHECK-GI-NEXT: tst w15, #0x1 +; CHECK-GI-NEXT: csel x2, x11, x17, ne +; CHECK-GI-NEXT: csel x3, x12, x18, ne +; CHECK-GI-NEXT: ret +entry: + %c = icmp slt <2 x i128> %a, %b + %s = select <2 x i1> %c, <2 x i128> %d, <2 x i128> %e + ret <2 x i128> %s +} + ; ===== ICMP Zero RHS ===== define <8 x i1> @icmp_eq_v8i8_Zero_RHS(<8 x i8> %a) { From d4f41befb7256f8e8378ae358b2b3d802454d6a4 Mon Sep 17 00:00:00 2001 From: "Henrik G. Olsson" Date: Fri, 13 Sep 2024 10:47:34 -0700 Subject: [PATCH 41/43] [Utils] add update-verify-tests.py (#97369) Adds a python script to automatically take output from a failed clang -verify test and update the test case(s) to expect the new behaviour. 
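
The script is a thin wrapper around UpdateVerifyTests.core.check_expectations,
so the same update can also be driven programmatically. A minimal sketch,
assuming the -verify output has already been captured to a file (the
"verify.log" path below is purely illustrative):

    # Hypothetical driver: replay a saved `clang -verify` log through the
    # updater; check_expectations returns a human-readable status string.
    from UpdateVerifyTests.core import check_expectations

    with open("verify.log") as f:  # assumed capture of clang's stderr
        print(check_expectations(f.readlines(), "expected"))

The usual workflow is simply to pipe the failing invocation straight into the
script, as shown in its own docstring below.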
--- .../Inputs/duplicate-diag.c | 8 + .../Inputs/duplicate-diag.c.expected | 8 + .../Inputs/infer-indentation.c | 8 + .../Inputs/infer-indentation.c.expected | 11 + .../Inputs/leave-existing-diags.c | 11 + .../Inputs/leave-existing-diags.c.expected | 12 + .../Inputs/multiple-errors.c | 6 + .../Inputs/multiple-errors.c.expected | 9 + .../multiple-missing-errors-same-line.c | 8 + ...ltiple-missing-errors-same-line.c.expected | 13 + .../update-verify-tests/Inputs/no-checks.c | 3 + .../Inputs/no-checks.c.expected | 4 + .../update-verify-tests/Inputs/no-diags.c | 5 + .../Inputs/no-diags.c.expected | 5 + .../Inputs/no-expected-diags.c | 4 + .../Inputs/no-expected-diags.c.expected | 4 + .../Inputs/non-default-prefix.c | 5 + .../Inputs/non-default-prefix.c.expected | 5 + .../Inputs/update-same-line.c | 4 + .../Inputs/update-same-line.c.expected | 4 + .../Inputs/update-single-check.c | 4 + .../Inputs/update-single-check.c.expected | 4 + .../update-verify-tests/duplicate-diag.test | 4 + .../infer-indentation.test | 3 + .../leave-existing-diags.test | 4 + .../utils/update-verify-tests/lit.local.cfg | 25 + .../update-verify-tests/multiple-errors.test | 3 + .../multiple-missing-errors-same-line.test | 3 + .../utils/update-verify-tests/no-checks.test | 3 + .../utils/update-verify-tests/no-diags.test | 4 + .../no-expected-diags.test | 4 + .../non-default-prefix.test | 4 + .../update-verify-tests/update-same-line.test | 4 + .../update-single-check.test | 3 + clang/utils/UpdateVerifyTests/core.py | 452 ++++++++++++++++++ clang/utils/update-verify-tests.py | 38 ++ 36 files changed, 699 insertions(+) create mode 100644 clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/infer-indentation.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-errors.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-checks.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-diags.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/update-same-line.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected create mode 100644 clang/test/utils/update-verify-tests/Inputs/update-single-check.c create mode 100644 clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected create mode 100644 
clang/test/utils/update-verify-tests/duplicate-diag.test create mode 100644 clang/test/utils/update-verify-tests/infer-indentation.test create mode 100644 clang/test/utils/update-verify-tests/leave-existing-diags.test create mode 100644 clang/test/utils/update-verify-tests/lit.local.cfg create mode 100644 clang/test/utils/update-verify-tests/multiple-errors.test create mode 100644 clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test create mode 100644 clang/test/utils/update-verify-tests/no-checks.test create mode 100644 clang/test/utils/update-verify-tests/no-diags.test create mode 100644 clang/test/utils/update-verify-tests/no-expected-diags.test create mode 100644 clang/test/utils/update-verify-tests/non-default-prefix.test create mode 100644 clang/test/utils/update-verify-tests/update-same-line.test create mode 100644 clang/test/utils/update-verify-tests/update-single-check.test create mode 100644 clang/utils/UpdateVerifyTests/core.py create mode 100644 clang/utils/update-verify-tests.py diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c new file mode 100644 index 0000000000000..8c7e46c6eca9c --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; a = 2; + b = 2; b = 2; + // expected-error@+1 3{{use of undeclared identifier 'c'}} + c = 2; c = 2; + // expected-error 2{{asdf}} +} diff --git a/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected new file mode 100644 index 0000000000000..6214ff382f449 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/duplicate-diag.c.expected @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1 2{{use of undeclared identifier 'a'}} + a = 2; a = 2; + // expected-error@+1 2{{use of undeclared identifier 'b'}} + b = 2; b = 2; + // expected-error@+1 2{{use of undeclared identifier 'c'}} + c = 2; c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c new file mode 100644 index 0000000000000..0210ac35fd5cd --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c @@ -0,0 +1,8 @@ +void foo() { + // expected-error@+1 2 {{use of undeclared identifier 'a'}} + a = 2; a = 2; b = 2; b = 2; c = 2; + // expected-error@+1 2 {{asdf}} + d = 2; + e = 2; f = 2; // expected-error 2 {{use of undeclared identifier 'e'}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected new file mode 100644 index 0000000000000..5c5aaeeef97ac --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/infer-indentation.c.expected @@ -0,0 +1,11 @@ +void foo() { + // expected-error@+3 {{use of undeclared identifier 'c'}} + // expected-error@+2 2 {{use of undeclared identifier 'b'}} + // expected-error@+1 2 {{use of undeclared identifier 'a'}} + a = 2; a = 2; b = 2; b = 2; c = 2; + // expected-error@+1 {{use of undeclared identifier 'd'}} + d = 2; + // expected-error@+1 {{use of undeclared identifier 'f'}} + e = 2; f = 2; // expected-error {{use of undeclared identifier 'e'}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c new 
file mode 100644 index 0000000000000..1aa8d088e9727 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c @@ -0,0 +1,11 @@ +void foo() { + a = 2; + // expected-error@-1{{use of undeclared identifier 'a'}} + b = 2;// expected-error{{use of undeclared identifier 'b'}} + c = 2; + // expected-error@5{{use of undeclared identifier 'c'}} + d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} + + e = 2; // error to trigger mismatch +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected new file mode 100644 index 0000000000000..6b621061bbfbb --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/leave-existing-diags.c.expected @@ -0,0 +1,12 @@ +void foo() { + a = 2; + // expected-error@-1{{use of undeclared identifier 'a'}} + b = 2;// expected-error{{use of undeclared identifier 'b'}} + c = 2; + // expected-error@5{{use of undeclared identifier 'c'}} + d = 2; // expected-error-re{{use of {{.*}} identifier 'd'}} + + // expected-error@+1{{use of undeclared identifier 'e'}} + e = 2; // error to trigger mismatch +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c new file mode 100644 index 0000000000000..e230e0a337bf4 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c @@ -0,0 +1,6 @@ +void foo() { + a = 2; + b = 2; + + c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected new file mode 100644 index 0000000000000..27dc1f30a26fa --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-errors.c.expected @@ -0,0 +1,9 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; + // expected-error@+1{{use of undeclared identifier 'b'}} + b = 2; + + // expected-error@+1{{use of undeclared identifier 'c'}} + c = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c new file mode 100644 index 0000000000000..03f723d44bbe8 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c @@ -0,0 +1,8 @@ +void foo() { + a = 2; b = 2; c = 2; +} + +void bar() { + x = 2; y = 2; z = 2; + // expected-error@-1{{use of undeclared identifier 'x'}} +} diff --git a/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected new file mode 100644 index 0000000000000..24b57f4353d95 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/multiple-missing-errors-same-line.c.expected @@ -0,0 +1,13 @@ +void foo() { + // expected-error@+3{{use of undeclared identifier 'c'}} + // expected-error@+2{{use of undeclared identifier 'b'}} + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; b = 2; c = 2; +} + +void bar() { + x = 2; y = 2; z = 2; + // expected-error@-1{{use of undeclared identifier 'x'}} + // expected-error@-2{{use of undeclared identifier 'y'}} + // expected-error@-3{{use of undeclared identifier 'z'}} +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c b/clang/test/utils/update-verify-tests/Inputs/no-checks.c new file mode 100644 index 0000000000000..8fd1f7cd33370 --- 
/dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-checks.c @@ -0,0 +1,3 @@ +void foo() { + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected new file mode 100644 index 0000000000000..e80548fbe50f2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-checks.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'bar'}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-diags.c new file mode 100644 index 0000000000000..66d169be43940 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-diags.c @@ -0,0 +1,5 @@ +void foo() { + // expected-error@+1{{asdf}} + int a = 2; +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected new file mode 100644 index 0000000000000..0523028494570 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-diags.c.expected @@ -0,0 +1,5 @@ +// expected-no-diagnostics +void foo() { + int a = 2; +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c new file mode 100644 index 0000000000000..78b72e1357da7 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c @@ -0,0 +1,4 @@ +// expected-no-diagnostics +void foo() { + a = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected new file mode 100644 index 0000000000000..d948ffce56189 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/no-expected-diags.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'a'}} + a = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c new file mode 100644 index 0000000000000..3d63eaf0f1b87 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c @@ -0,0 +1,5 @@ +void foo() { + a = 2; // check-error{{asdf}} + // expected-error@-1{ignored}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected new file mode 100644 index 0000000000000..a877f86922123 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/non-default-prefix.c.expected @@ -0,0 +1,5 @@ +void foo() { + a = 2; // check-error{{use of undeclared identifier 'a'}} + // expected-error@-1{ignored}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c new file mode 100644 index 0000000000000..5278ce0c57c31 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c @@ -0,0 +1,4 @@ +void foo() { + bar = 2; // expected-error {{asdf}} +} + diff --git a/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected new file mode 100644 index 0000000000000..8ba47f788319b --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-same-line.c.expected @@ -0,0 +1,4 @@ +void foo() { + bar = 2; // expected-error {{use of undeclared identifier 'bar'}} +} + 
diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c new file mode 100644 index 0000000000000..20b011bfc3d77 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{asdf}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected new file mode 100644 index 0000000000000..e80548fbe50f2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/Inputs/update-single-check.c.expected @@ -0,0 +1,4 @@ +void foo() { + // expected-error@+1{{use of undeclared identifier 'bar'}} + bar = 2; +} diff --git a/clang/test/utils/update-verify-tests/duplicate-diag.test b/clang/test/utils/update-verify-tests/duplicate-diag.test new file mode 100644 index 0000000000000..3163ce46199c3 --- /dev/null +++ b/clang/test/utils/update-verify-tests/duplicate-diag.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/duplicate-diag.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/duplicate-diag.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/infer-indentation.test b/clang/test/utils/update-verify-tests/infer-indentation.test new file mode 100644 index 0000000000000..6ba2f5d9d505b --- /dev/null +++ b/clang/test/utils/update-verify-tests/infer-indentation.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/infer-indentation.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/infer-indentation.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/leave-existing-diags.test b/clang/test/utils/update-verify-tests/leave-existing-diags.test new file mode 100644 index 0000000000000..cde690ef715a6 --- /dev/null +++ b/clang/test/utils/update-verify-tests/leave-existing-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/leave-existing-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/leave-existing-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/lit.local.cfg b/clang/test/utils/update-verify-tests/lit.local.cfg new file mode 100644 index 0000000000000..a0b6afccc2501 --- /dev/null +++ b/clang/test/utils/update-verify-tests/lit.local.cfg @@ -0,0 +1,25 @@ +import lit.util + +# python 2.7 backwards compatibility +try: + from shlex import quote as shell_quote +except ImportError: + from pipes import quote as shell_quote + +if config.standalone_build: + # These tests require the update-verify-tests.py script from the clang + # source tree, so skip these tests if we are doing standalone builds. 
+ config.unsupported = True +else: + config.suffixes = [".test"] + + script_path = os.path.join( + config.clang_src_dir, "utils", "update-verify-tests.py" + ) + python = shell_quote(config.python_executable) + config.substitutions.append( + ( + "%update-verify-tests", + "%s %s" % (python, shell_quote(script_path)), + ) + ) diff --git a/clang/test/utils/update-verify-tests/multiple-errors.test b/clang/test/utils/update-verify-tests/multiple-errors.test new file mode 100644 index 0000000000000..1332ef365dc86 --- /dev/null +++ b/clang/test/utils/update-verify-tests/multiple-errors.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/multiple-errors.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/multiple-errors.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test b/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test new file mode 100644 index 0000000000000..a9c21cd77e192 --- /dev/null +++ b/clang/test/utils/update-verify-tests/multiple-missing-errors-same-line.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/multiple-missing-errors-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/multiple-missing-errors-same-line.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-checks.test b/clang/test/utils/update-verify-tests/no-checks.test new file mode 100644 index 0000000000000..f6ea91fa552be --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-checks.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/no-checks.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-checks.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/test/utils/update-verify-tests/no-diags.test b/clang/test/utils/update-verify-tests/no-diags.test new file mode 100644 index 0000000000000..464fe8894253b --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/no-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/no-expected-diags.test b/clang/test/utils/update-verify-tests/no-expected-diags.test new file mode 100644 index 0000000000000..75235f17a64a2 --- /dev/null +++ b/clang/test/utils/update-verify-tests/no-expected-diags.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/no-expected-diags.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/no-expected-diags.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/non-default-prefix.test b/clang/test/utils/update-verify-tests/non-default-prefix.test new file mode 100644 index 0000000000000..e581755a6e603 --- /dev/null +++ b/clang/test/utils/update-verify-tests/non-default-prefix.test @@ -0,0 +1,4 @@ +# RUN: cp %S/Inputs/non-default-prefix.c %t.c && not %clang_cc1 -verify=check %t.c 2>&1 | %update-verify-tests --prefix check +# RUN: diff -u %S/Inputs/non-default-prefix.c.expected %t.c +# RUN: %clang_cc1 -verify=check %t.c + diff --git a/clang/test/utils/update-verify-tests/update-same-line.test b/clang/test/utils/update-verify-tests/update-same-line.test new file mode 100644 index 0000000000000..324768eae5faa --- /dev/null +++ b/clang/test/utils/update-verify-tests/update-same-line.test @@ -0,0 +1,4 @@ +# RUN: cp 
%S/Inputs/update-same-line.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/update-same-line.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c + diff --git a/clang/test/utils/update-verify-tests/update-single-check.test b/clang/test/utils/update-verify-tests/update-single-check.test new file mode 100644 index 0000000000000..2cb1ae3bcbd3b --- /dev/null +++ b/clang/test/utils/update-verify-tests/update-single-check.test @@ -0,0 +1,3 @@ +# RUN: cp %S/Inputs/update-single-check.c %t.c && not %clang_cc1 -verify %t.c 2>&1 | %update-verify-tests +# RUN: diff -u %S/Inputs/update-single-check.c.expected %t.c +# RUN: %clang_cc1 -verify %t.c diff --git a/clang/utils/UpdateVerifyTests/core.py b/clang/utils/UpdateVerifyTests/core.py new file mode 100644 index 0000000000000..d1350cdbb698b --- /dev/null +++ b/clang/utils/UpdateVerifyTests/core.py @@ -0,0 +1,452 @@ +import sys +import re + +DEBUG = False + + +def dprint(*args): + if DEBUG: + print(*args, file=sys.stderr) + + +class KnownException(Exception): + pass + + +def parse_error_category(s, prefix): + if "no expected directives found" in s: + return None + parts = s.split("diagnostics") + diag_category = parts[0] + category_parts = parts[0].strip().strip("'").split("-") + expected = category_parts[0] + if expected != prefix: + raise Exception( + f"expected prefix '{prefix}', but found '{expected}'. Multiple verify prefixes are not supported." + ) + diag_category = category_parts[1] + if "seen but not expected" in parts[1]: + seen = True + elif "expected but not seen" in parts[1]: + seen = False + else: + raise KnownException(f"unexpected category '{parts[1]}'") + return (diag_category, seen) + + +diag_error_re = re.compile(r"File (\S+) Line (\d+): (.+)") +diag_error_re2 = re.compile(r"File \S+ Line \d+ \(directive at (\S+):(\d+)\): (.+)") + + +def parse_diag_error(s): + m = diag_error_re2.match(s) + if not m: + m = diag_error_re.match(s) + if not m: + return None + return (m.group(1), int(m.group(2)), m.group(3)) + + +class Line: + def __init__(self, content, line_n): + self.content = content + self.diag = None + self.line_n = line_n + self.targeting_diags = [] + + def update_line_n(self, n): + self.line_n = n + + def render(self): + if not self.diag: + return self.content + assert "{{DIAG}}" in self.content + res = self.content.replace("{{DIAG}}", self.diag.render()) + if not res.strip(): + return "" + return res + + +class Diag: + def __init__( + self, + prefix, + diag_content, + category, + parsed_target_line_n, + line_is_absolute, + count, + line, + is_re, + whitespace_strings, + is_from_source_file, + ): + self.prefix = prefix + self.diag_content = diag_content + self.category = category + self.parsed_target_line_n = parsed_target_line_n + self.line_is_absolute = line_is_absolute + self.count = count + self.line = line + self.target = None + self.is_re = is_re + self.absolute_target() + self.whitespace_strings = whitespace_strings + self.is_from_source_file = is_from_source_file + + def decrement_count(self): + self.count -= 1 + assert self.count >= 0 + + def increment_count(self): + assert self.count >= 0 + self.count += 1 + + def unset_target(self): + assert self.target is not None + self.target.targeting_diags.remove(self) + self.target = None + + def set_target(self, target): + if self.target: + self.unset_target() + self.target = target + self.target.targeting_diags.append(self) + + def absolute_target(self): + if self.target: + return self.target.line_n + if self.line_is_absolute: + return 
self.parsed_target_line_n + return self.line.line_n + self.parsed_target_line_n + + def relative_target(self): + return self.absolute_target() - self.line.line_n + + def take(self, other_diag): + assert self.count == 0 + assert other_diag.count > 0 + assert other_diag.target == self.target + assert not other_diag.line_is_absolute + assert not other_diag.is_re and not self.is_re + self.line_is_absolute = False + self.diag_content = other_diag.diag_content + self.count = other_diag.count + self.category = other_diag.category + self.count = other_diag.count + other_diag.count = 0 + + def render(self): + assert self.count >= 0 + if self.count == 0: + return "" + line_location_s = "" + if self.relative_target() != 0: + if self.line_is_absolute: + line_location_s = f"@{self.absolute_target()}" + elif self.relative_target() > 0: + line_location_s = f"@+{self.relative_target()}" + else: + line_location_s = ( + f"@{self.relative_target()}" # the minus sign is implicit + ) + count_s = "" if self.count == 1 else f"{self.count}" + re_s = "-re" if self.is_re else "" + if self.whitespace_strings: + whitespace1_s = self.whitespace_strings[0] + whitespace2_s = self.whitespace_strings[1] + whitespace3_s = self.whitespace_strings[2] + else: + whitespace1_s = " " + whitespace2_s = "" + whitespace3_s = "" + if count_s and not whitespace2_s: + whitespace2_s = " " # required to parse correctly + elif not count_s and whitespace2_s == " ": + """Don't emit a weird extra space. + However if the whitespace is something other than the + standard single space, let it be to avoid disrupting manual formatting. + The existence of a non-empty whitespace2_s implies this was parsed with + a count > 1 and then decremented, otherwise this whitespace would have + been parsed as whitespace3_s. + """ + whitespace2_s = "" + return f"//{whitespace1_s}{self.prefix}-{self.category}{re_s}{line_location_s}{whitespace2_s}{count_s}{whitespace3_s}{{{{{self.diag_content}}}}}" + + +expected_diag_re = re.compile( + r"//(\s*)([a-zA-Z]+)-(note|warning|error)(-re)?(@[+-]?\d+)?(?:(\s*)(\d+))?(\s*)\{\{(.*)\}\}" +) + + +def parse_diag(line, filename, lines, prefix): + s = line.content + ms = expected_diag_re.findall(s) + if not ms: + return None + if len(ms) > 1: + raise KnownException( + f"multiple diags on line {filename}:{line.line_n}. Aborting due to missing implementation." 
+ ) + [ + whitespace1_s, + check_prefix, + category_s, + re_s, + target_line_s, + whitespace2_s, + count_s, + whitespace3_s, + diag_s, + ] = ms[0] + if check_prefix != prefix: + return None + if not target_line_s: + target_line_n = 0 + is_absolute = False + elif target_line_s.startswith("@+"): + target_line_n = int(target_line_s[2:]) + is_absolute = False + elif target_line_s.startswith("@-"): + target_line_n = int(target_line_s[1:]) + is_absolute = False + else: + target_line_n = int(target_line_s[1:]) + is_absolute = True + count = int(count_s) if count_s else 1 + line.content = expected_diag_re.sub("{{DIAG}}", s) + + return Diag( + prefix, + diag_s, + category_s, + target_line_n, + is_absolute, + count, + line, + bool(re_s), + [whitespace1_s, whitespace2_s, whitespace3_s], + True, + ) + + +def add_line(new_line, lines): + lines.insert(new_line.line_n - 1, new_line) + for i in range(new_line.line_n, len(lines)): + line = lines[i] + assert line.line_n == i + line.update_line_n(i + 1) + assert all(line.line_n == i + 1 for i, line in enumerate(lines)) + + +def remove_line(old_line, lines): + lines.remove(old_line) + for i in range(old_line.line_n - 1, len(lines)): + line = lines[i] + assert line.line_n == i + 2 + line.update_line_n(i + 1) + assert all(line.line_n == i + 1 for i, line in enumerate(lines)) + + +indent_re = re.compile(r"\s*") + + +def get_indent(s): + return indent_re.match(s).group(0) + + +def orig_line_n_to_new_line_n(line_n, orig_lines): + return orig_lines[line_n - 1].line_n + + +def add_diag(orig_line_n, diag_s, diag_category, lines, orig_lines, prefix): + line_n = orig_line_n_to_new_line_n(orig_line_n, orig_lines) + target = lines[line_n - 1] + for other in target.targeting_diags: + if other.is_re: + raise KnownException( + "mismatching diag on line with regex matcher. 
Skipping due to missing implementation" + ) + reverse = ( + True + if [other for other in target.targeting_diags if other.relative_target() < 0] + else False + ) + + targeting = [ + other for other in target.targeting_diags if not other.line_is_absolute + ] + targeting.sort(reverse=reverse, key=lambda d: d.relative_target()) + prev_offset = 0 + prev_line = target + direction = -1 if reverse else 1 + for d in targeting: + if d.relative_target() != prev_offset + direction: + break + prev_offset = d.relative_target() + prev_line = d.line + total_offset = prev_offset - 1 if reverse else prev_offset + 1 + if reverse: + new_line_n = prev_line.line_n + 1 + else: + new_line_n = prev_line.line_n + assert new_line_n == line_n + (not reverse) - total_offset + + new_line = Line(get_indent(prev_line.content) + "{{DIAG}}\n", new_line_n) + add_line(new_line, lines) + + whitespace_strings = prev_line.diag.whitespace_strings if prev_line.diag else None + new_diag = Diag( + prefix, + diag_s, + diag_category, + total_offset, + False, + 1, + new_line, + False, + whitespace_strings, + False, + ) + new_line.diag = new_diag + new_diag.set_target(target) + + +def remove_dead_diags(lines): + for line in lines: + if not line.diag or line.diag.count != 0: + continue + if line.render() == "": + remove_line(line, lines) + else: + assert line.diag.is_from_source_file + for other_diag in line.targeting_diags: + if ( + other_diag.is_from_source_file + or other_diag.count == 0 + or other_diag.category != line.diag.category + ): + continue + if other_diag.is_re or line.diag.is_re: + continue + line.diag.take(other_diag) + remove_line(other_diag.line, lines) + + +def has_live_diags(lines): + for line in lines: + if line.diag and line.diag.count > 0: + return True + return False + + +def get_expected_no_diags_line_n(lines, prefix): + for line in lines: + if f"{prefix}-no-diagnostics" in line.content: + return line.line_n + return None + + +def update_test_file(filename, diag_errors, prefix, updated_test_files): + dprint(f"updating test file {filename}") + if filename in updated_test_files: + raise KnownException(f"{filename} already updated, but got new output") + else: + updated_test_files.add(filename) + with open(filename, "r") as f: + lines = [Line(line, i + 1) for i, line in enumerate(f.readlines())] + orig_lines = list(lines) + expected_no_diags_line_n = get_expected_no_diags_line_n(orig_lines, prefix) + + for line in lines: + diag = parse_diag(line, filename, lines, prefix) + if diag: + line.diag = diag + diag.set_target(lines[diag.absolute_target() - 1]) + + for line_n, diag_s, diag_category, seen in diag_errors: + if seen: + continue + # this is a diagnostic expected but not seen + assert lines[line_n - 1].diag + if diag_s != lines[line_n - 1].diag.diag_content: + raise KnownException( + f"{filename}:{line_n} - found diag {lines[line_n - 1].diag.diag_content} but expected {diag_s}" + ) + if diag_category != lines[line_n - 1].diag.category: + raise KnownException( + f"{filename}:{line_n} - found {lines[line_n - 1].diag.category} diag but expected {diag_category}" + ) + lines[line_n - 1].diag.decrement_count() + diag_errors_left = [] + diag_errors.sort(reverse=True, key=lambda t: t[0]) + for line_n, diag_s, diag_category, seen in diag_errors: + if not seen: + continue + target = orig_lines[line_n - 1] + other_diags = [ + d + for d in target.targeting_diags + if d.diag_content == diag_s and d.category == diag_category + ] + other_diag = other_diags[0] if other_diags else None + if other_diag: + 
other_diag.increment_count()
+        else:
+            add_diag(line_n, diag_s, diag_category, lines, orig_lines, prefix)
+    remove_dead_diags(lines)
+    has_diags = has_live_diags(lines)
+    with open(filename, "w") as f:
+        if not has_diags and expected_no_diags_line_n is None:
+            f.write("// expected-no-diagnostics\n")
+        for line in lines:
+            if has_diags and line.line_n == expected_no_diags_line_n:
+                continue
+            f.write(line.render())
+
+
+def update_test_files(errors, prefix):
+    errors_by_file = {}
+    for (filename, line, diag_s), (diag_category, seen) in errors:
+        if filename not in errors_by_file:
+            errors_by_file[filename] = []
+        errors_by_file[filename].append((line, diag_s, diag_category, seen))
+    updated_test_files = set()
+    for filename, diag_errors in errors_by_file.items():
+        try:
+            update_test_file(filename, diag_errors, prefix, updated_test_files)
+        except KnownException as e:
+            return f"Error in update-verify-tests while updating {filename}: {e}"
+    updated_files = list(updated_test_files)
+    assert updated_files
+    if len(updated_files) == 1:
+        return f"updated file {updated_files[0]}"
+    updated_files_s = "\n\t".join(updated_files)
+    return f"updated files:\n\t{updated_files_s}"
+
+
+def check_expectations(tool_output, prefix):
+    """
+    The entry point function.
+    Called by the stand-alone update-verify-tests.py as well as litplugin.py.
+    """
+    curr = []
+    curr_category = None
+    try:
+        for line in tool_output:
+            if line.startswith("error: "):
+                curr_category = parse_error_category(line[len("error: ") :], prefix)
+                continue
+
+            diag_error = parse_diag_error(line.strip())
+            if diag_error:
+                curr.append((diag_error, curr_category))
+            else:
+                dprint("no match")
+                dprint(line.strip())
+    except KnownException as e:
+        return f"Error in update-verify-tests while parsing tool output: {e}"
+    if curr:
+        return update_test_files(curr, prefix)
+    else:
+        return "no mismatching diagnostics found"
diff --git a/clang/utils/update-verify-tests.py b/clang/utils/update-verify-tests.py
new file mode 100644
index 0000000000000..e2874a8c049ef
--- /dev/null
+++ b/clang/utils/update-verify-tests.py
@@ -0,0 +1,38 @@
+import sys
+import argparse
+from UpdateVerifyTests.core import check_expectations
+
+"""
+    Pipe output from clang's -verify into this script to have the test case updated to expect the actual diagnostic output.
+    When inserting new expected-* checks it will place them on the line before the location of the diagnostic, with an @+1,
+    or @+N for some N if there are multiple diagnostics emitted on the same line. If the current checks are using @-N for
+    this line, the new check will follow that convention also.
+    Existing checks will be left untouched as much as possible, including their location and whitespace content, to minimize
+    diffs. If inaccurate, their count will be updated, or the check removed entirely.
+
+    Missing features:
+      - multiple prefixes on the same line (-verify=my-prefix,my-other-prefix)
+      - multiple prefixes on separate RUN lines (RUN: -verify=my-prefix\nRUN: -verify my-other-prefix)
+      - regexes with expected-*-re: existing ones will be left untouched if accurate, but the script will abort if there are any
+        diagnostic mismatches on the same line.
+ - multiple checks targeting the same line are supported, but a line may only contain one check + - if multiple checks targeting the same line are failing the script is not guaranteed to produce a minimal diff + +Example usage: + clang -verify [file] | python3 update-verify-tests.py + clang -verify=check [file] | python3 update-verify-tests.py --prefix check +""" + + +def main(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument( + "--prefix", default="expected", help="The prefix passed to -verify" + ) + args = parser.parse_args() + output = check_expectations(sys.stdin.readlines(), args.prefix) + print(output) + + +if __name__ == "__main__": + main() From 4c040c027575f3a30dc94bfab4c975567195bdc7 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Fri, 13 Sep 2024 14:11:30 -0400 Subject: [PATCH 42/43] [Coroutines] Move Shape to its own header (#108242) * To create custom ABIs plugin libraries need access to CoroShape. * As a step in enabling plugin libraries, move Shape into its own header * The header will eventually be moved into include/llvm/Transforms/Coroutines See RFC for more info: https://discourse.llvm.org/t/rfc-abi-objects-for-coroutines/81057 --- llvm/lib/Transforms/Coroutines/CoroEarly.cpp | 1 + llvm/lib/Transforms/Coroutines/CoroInternal.h | 224 +--------------- llvm/lib/Transforms/Coroutines/CoroShape.h | 249 ++++++++++++++++++ llvm/lib/Transforms/Coroutines/Coroutines.cpp | 1 + 4 files changed, 252 insertions(+), 223 deletions(-) create mode 100644 llvm/lib/Transforms/Coroutines/CoroShape.h diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp index 13b6680264c87..5f8efd1a8f32e 100644 --- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp @@ -8,6 +8,7 @@ #include "llvm/Transforms/Coroutines/CoroEarly.h" #include "CoroInternal.h" +#include "CoroShape.h" #include "llvm/IR/DIBuilder.h" #include "llvm/IR/Function.h" #include "llvm/IR/IRBuilder.h" diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h index 891798f53b2d0..fcbd31878bdea 100644 --- a/llvm/lib/Transforms/Coroutines/CoroInternal.h +++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h @@ -12,6 +12,7 @@ #define LLVM_LIB_TRANSFORMS_COROUTINES_COROINTERNAL_H #include "CoroInstr.h" +#include "CoroShape.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/IRBuilder.h" @@ -58,229 +59,6 @@ struct LowererBase { CallInst *makeSubFnCall(Value *Arg, int Index, Instruction *InsertPt); }; -enum class ABI { - /// The "resume-switch" lowering, where there are separate resume and - /// destroy functions that are shared between all suspend points. The - /// coroutine frame implicitly stores the resume and destroy functions, - /// the current index, and any promise value. - Switch, - - /// The "returned-continuation" lowering, where each suspend point creates a - /// single continuation function that is used for both resuming and - /// destroying. Does not support promises. - Retcon, - - /// The "unique returned-continuation" lowering, where each suspend point - /// creates a single continuation function that is used for both resuming - /// and destroying. Does not support promises. The function is known to - /// suspend at most once during its execution, and the return value of - /// the continuation is void. - RetconOnce, - - /// The "async continuation" lowering, where each suspend point creates a - /// single continuation function. 
The continuation function is available as an - /// intrinsic. - Async, -}; - -// Holds structural Coroutine Intrinsics for a particular function and other -// values used during CoroSplit pass. -struct LLVM_LIBRARY_VISIBILITY Shape { - CoroBeginInst *CoroBegin; - SmallVector CoroEnds; - SmallVector CoroSizes; - SmallVector CoroAligns; - SmallVector CoroSuspends; - SmallVector SwiftErrorOps; - SmallVector CoroAwaitSuspends; - SmallVector SymmetricTransfers; - - // Field indexes for special fields in the switch lowering. - struct SwitchFieldIndex { - enum { - Resume, - Destroy - - // The promise field is always at a fixed offset from the start of - // frame given its type, but the index isn't a constant for all - // possible frames. - - // The switch-index field isn't at a fixed offset or index, either; - // we just work it in where it fits best. - }; - }; - - coro::ABI ABI; - - StructType *FrameTy; - Align FrameAlign; - uint64_t FrameSize; - Value *FramePtr; - BasicBlock *AllocaSpillBlock; - - /// This would only be true if optimization are enabled. - bool OptimizeFrame; - - struct SwitchLoweringStorage { - SwitchInst *ResumeSwitch; - AllocaInst *PromiseAlloca; - BasicBlock *ResumeEntryBlock; - unsigned IndexField; - unsigned IndexAlign; - unsigned IndexOffset; - bool HasFinalSuspend; - bool HasUnwindCoroEnd; - }; - - struct RetconLoweringStorage { - Function *ResumePrototype; - Function *Alloc; - Function *Dealloc; - BasicBlock *ReturnBlock; - bool IsFrameInlineInStorage; - }; - - struct AsyncLoweringStorage { - Value *Context; - CallingConv::ID AsyncCC; - unsigned ContextArgNo; - uint64_t ContextHeaderSize; - uint64_t ContextAlignment; - uint64_t FrameOffset; // Start of the frame. - uint64_t ContextSize; // Includes frame size. - GlobalVariable *AsyncFuncPointer; - - Align getContextAlignment() const { return Align(ContextAlignment); } - }; - - union { - SwitchLoweringStorage SwitchLowering; - RetconLoweringStorage RetconLowering; - AsyncLoweringStorage AsyncLowering; - }; - - CoroIdInst *getSwitchCoroId() const { - assert(ABI == coro::ABI::Switch); - return cast(CoroBegin->getId()); - } - - AnyCoroIdRetconInst *getRetconCoroId() const { - assert(ABI == coro::ABI::Retcon || - ABI == coro::ABI::RetconOnce); - return cast(CoroBegin->getId()); - } - - CoroIdAsyncInst *getAsyncCoroId() const { - assert(ABI == coro::ABI::Async); - return cast(CoroBegin->getId()); - } - - unsigned getSwitchIndexField() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return SwitchLowering.IndexField; - } - IntegerType *getIndexType() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return cast(FrameTy->getElementType(getSwitchIndexField())); - } - ConstantInt *getIndex(uint64_t Value) const { - return ConstantInt::get(getIndexType(), Value); - } - - PointerType *getSwitchResumePointerType() const { - assert(ABI == coro::ABI::Switch); - assert(FrameTy && "frame type not assigned"); - return cast(FrameTy->getElementType(SwitchFieldIndex::Resume)); - } - - FunctionType *getResumeFunctionType() const { - switch (ABI) { - case coro::ABI::Switch: - return FunctionType::get(Type::getVoidTy(FrameTy->getContext()), - PointerType::getUnqual(FrameTy->getContext()), - /*IsVarArg=*/false); - case coro::ABI::Retcon: - case coro::ABI::RetconOnce: - return RetconLowering.ResumePrototype->getFunctionType(); - case coro::ABI::Async: - // Not used. The function type depends on the active suspend. 
-      return nullptr;
-    }
-
-    llvm_unreachable("Unknown coro::ABI enum");
-  }
-
-  ArrayRef<Type *> getRetconResultTypes() const {
-    assert(ABI == coro::ABI::Retcon ||
-           ABI == coro::ABI::RetconOnce);
-    auto FTy = CoroBegin->getFunction()->getFunctionType();
-
-    // The safety of all this is checked by checkWFRetconPrototype.
-    if (auto STy = dyn_cast<StructType>(FTy->getReturnType())) {
-      return STy->elements().slice(1);
-    } else {
-      return ArrayRef<Type *>();
-    }
-  }
-
-  ArrayRef<Type *> getRetconResumeTypes() const {
-    assert(ABI == coro::ABI::Retcon ||
-           ABI == coro::ABI::RetconOnce);
-
-    // The safety of all this is checked by checkWFRetconPrototype.
-    auto FTy = RetconLowering.ResumePrototype->getFunctionType();
-    return FTy->params().slice(1);
-  }
-
-  CallingConv::ID getResumeFunctionCC() const {
-    switch (ABI) {
-    case coro::ABI::Switch:
-      return CallingConv::Fast;
-
-    case coro::ABI::Retcon:
-    case coro::ABI::RetconOnce:
-      return RetconLowering.ResumePrototype->getCallingConv();
-    case coro::ABI::Async:
-      return AsyncLowering.AsyncCC;
-    }
-    llvm_unreachable("Unknown coro::ABI enum");
-  }
-
-  AllocaInst *getPromiseAlloca() const {
-    if (ABI == coro::ABI::Switch)
-      return SwitchLowering.PromiseAlloca;
-    return nullptr;
-  }
-
-  BasicBlock::iterator getInsertPtAfterFramePtr() const {
-    if (auto *I = dyn_cast<Instruction>(FramePtr)) {
-      BasicBlock::iterator It = std::next(I->getIterator());
-      It.setHeadBit(true); // Copy pre-RemoveDIs behaviour.
-      return It;
-    }
-    return cast<Argument>(FramePtr)->getParent()->getEntryBlock().begin();
-  }
-
-  /// Allocate memory according to the rules of the active lowering.
-  ///
-  /// \param CG - if non-null, will be updated for the new call
-  Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const;
-
-  /// Deallocate memory according to the rules of the active lowering.
-  ///
-  /// \param CG - if non-null, will be updated for the new call
-  void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const;
-
-  Shape() = default;
-  explicit Shape(Function &F, bool OptimizeFrame = false)
-      : OptimizeFrame(OptimizeFrame) {
-    buildFrom(F);
-  }
-  void buildFrom(Function &F);
-};
-
 bool defaultMaterializable(Instruction &V);
 void normalizeCoroutine(Function &F, coro::Shape &Shape,
                         TargetTransformInfo &TTI);
diff --git a/llvm/lib/Transforms/Coroutines/CoroShape.h b/llvm/lib/Transforms/Coroutines/CoroShape.h
new file mode 100644
index 0000000000000..f5798b63bf732
--- /dev/null
+++ b/llvm/lib/Transforms/Coroutines/CoroShape.h
@@ -0,0 +1,249 @@
+//===- CoroShape.h - Coroutine info for lowering --------------*- C++ -*---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file declares the shape info struct that is required by many coroutine
+// utility methods.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H
+#define LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H
+
+#include "CoroInstr.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+class CallGraph;
+
+namespace coro {
+
+enum class ABI {
+  /// The "resume-switch" lowering, where there are separate resume and
+  /// destroy functions that are shared between all suspend points. The
+  /// coroutine frame implicitly stores the resume and destroy functions,
+  /// the current index, and any promise value.
+  Switch,
+
+  /// The "returned-continuation" lowering, where each suspend point creates a
+  /// single continuation function that is used for both resuming and
+  /// destroying. Does not support promises.
+  Retcon,
+
+  /// The "unique returned-continuation" lowering, where each suspend point
+  /// creates a single continuation function that is used for both resuming
+  /// and destroying. Does not support promises. The function is known to
+  /// suspend at most once during its execution, and the return value of
+  /// the continuation is void.
+  RetconOnce,
+
+  /// The "async continuation" lowering, where each suspend point creates a
+  /// single continuation function. The continuation function is available as an
+  /// intrinsic.
+  Async,
+};
+
+// Holds structural Coroutine Intrinsics for a particular function and other
+// values used during CoroSplit pass.
+struct LLVM_LIBRARY_VISIBILITY Shape {
+  CoroBeginInst *CoroBegin;
+  SmallVector<AnyCoroEndInst *, 4> CoroEnds;
+  SmallVector<CoroSizeInst *, 2> CoroSizes;
+  SmallVector<CoroAlignInst *, 2> CoroAligns;
+  SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
+  SmallVector<CallInst *, 2> SwiftErrorOps;
+  SmallVector<CoroAwaitSuspendInst *, 4> CoroAwaitSuspends;
+  SmallVector<CallInst *> SymmetricTransfers;
+
+  // Field indexes for special fields in the switch lowering.
+  struct SwitchFieldIndex {
+    enum {
+      Resume,
+      Destroy
+
+      // The promise field is always at a fixed offset from the start of
+      // frame given its type, but the index isn't a constant for all
+      // possible frames.
+
+      // The switch-index field isn't at a fixed offset or index, either;
+      // we just work it in where it fits best.
+    };
+  };
+
+  coro::ABI ABI;
+
+  StructType *FrameTy;
+  Align FrameAlign;
+  uint64_t FrameSize;
+  Value *FramePtr;
+  BasicBlock *AllocaSpillBlock;
+
+  /// This would only be true if optimization are enabled.
+  bool OptimizeFrame;
+
+  struct SwitchLoweringStorage {
+    SwitchInst *ResumeSwitch;
+    AllocaInst *PromiseAlloca;
+    BasicBlock *ResumeEntryBlock;
+    unsigned IndexField;
+    unsigned IndexAlign;
+    unsigned IndexOffset;
+    bool HasFinalSuspend;
+    bool HasUnwindCoroEnd;
+  };
+
+  struct RetconLoweringStorage {
+    Function *ResumePrototype;
+    Function *Alloc;
+    Function *Dealloc;
+    BasicBlock *ReturnBlock;
+    bool IsFrameInlineInStorage;
+  };
+
+  struct AsyncLoweringStorage {
+    Value *Context;
+    CallingConv::ID AsyncCC;
+    unsigned ContextArgNo;
+    uint64_t ContextHeaderSize;
+    uint64_t ContextAlignment;
+    uint64_t FrameOffset; // Start of the frame.
+    uint64_t ContextSize; // Includes frame size.
+    GlobalVariable *AsyncFuncPointer;
+
+    Align getContextAlignment() const { return Align(ContextAlignment); }
+  };
+
+  union {
+    SwitchLoweringStorage SwitchLowering;
+    RetconLoweringStorage RetconLowering;
+    AsyncLoweringStorage AsyncLowering;
+  };
+
+  CoroIdInst *getSwitchCoroId() const {
+    assert(ABI == coro::ABI::Switch);
+    return cast<CoroIdInst>(CoroBegin->getId());
+  }
+
+  AnyCoroIdRetconInst *getRetconCoroId() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+    return cast<AnyCoroIdRetconInst>(CoroBegin->getId());
+  }
+
+  CoroIdAsyncInst *getAsyncCoroId() const {
+    assert(ABI == coro::ABI::Async);
+    return cast<CoroIdAsyncInst>(CoroBegin->getId());
+  }
+
+  unsigned getSwitchIndexField() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return SwitchLowering.IndexField;
+  }
+  IntegerType *getIndexType() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return cast<IntegerType>(FrameTy->getElementType(getSwitchIndexField()));
+  }
+  ConstantInt *getIndex(uint64_t Value) const {
+    return ConstantInt::get(getIndexType(), Value);
+  }
+
+  PointerType *getSwitchResumePointerType() const {
+    assert(ABI == coro::ABI::Switch);
+    assert(FrameTy && "frame type not assigned");
+    return cast<PointerType>(FrameTy->getElementType(SwitchFieldIndex::Resume));
+  }
+
+  FunctionType *getResumeFunctionType() const {
+    switch (ABI) {
+    case coro::ABI::Switch:
+      return FunctionType::get(Type::getVoidTy(FrameTy->getContext()),
+                               PointerType::getUnqual(FrameTy->getContext()),
+                               /*IsVarArg=*/false);
+    case coro::ABI::Retcon:
+    case coro::ABI::RetconOnce:
+      return RetconLowering.ResumePrototype->getFunctionType();
+    case coro::ABI::Async:
+      // Not used. The function type depends on the active suspend.
+      return nullptr;
+    }
+
+    llvm_unreachable("Unknown coro::ABI enum");
+  }
+
+  ArrayRef<Type *> getRetconResultTypes() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+    auto FTy = CoroBegin->getFunction()->getFunctionType();
+
+    // The safety of all this is checked by checkWFRetconPrototype.
+    if (auto STy = dyn_cast<StructType>(FTy->getReturnType())) {
+      return STy->elements().slice(1);
+    } else {
+      return ArrayRef<Type *>();
+    }
+  }
+
+  ArrayRef<Type *> getRetconResumeTypes() const {
+    assert(ABI == coro::ABI::Retcon || ABI == coro::ABI::RetconOnce);
+
+    // The safety of all this is checked by checkWFRetconPrototype.
+    auto FTy = RetconLowering.ResumePrototype->getFunctionType();
+    return FTy->params().slice(1);
+  }
+
+  CallingConv::ID getResumeFunctionCC() const {
+    switch (ABI) {
+    case coro::ABI::Switch:
+      return CallingConv::Fast;
+
+    case coro::ABI::Retcon:
+    case coro::ABI::RetconOnce:
+      return RetconLowering.ResumePrototype->getCallingConv();
+    case coro::ABI::Async:
+      return AsyncLowering.AsyncCC;
+    }
+    llvm_unreachable("Unknown coro::ABI enum");
+  }
+
+  AllocaInst *getPromiseAlloca() const {
+    if (ABI == coro::ABI::Switch)
+      return SwitchLowering.PromiseAlloca;
+    return nullptr;
+  }
+
+  BasicBlock::iterator getInsertPtAfterFramePtr() const {
+    if (auto *I = dyn_cast<Instruction>(FramePtr)) {
+      BasicBlock::iterator It = std::next(I->getIterator());
+      It.setHeadBit(true); // Copy pre-RemoveDIs behaviour.
+      return It;
+    }
+    return cast<Argument>(FramePtr)->getParent()->getEntryBlock().begin();
+  }
+
+  /// Allocate memory according to the rules of the active lowering.
+  ///
+  /// \param CG - if non-null, will be updated for the new call
+  Value *emitAlloc(IRBuilder<> &Builder, Value *Size, CallGraph *CG) const;
+
+  /// Deallocate memory according to the rules of the active lowering.
+  ///
+  /// \param CG - if non-null, will be updated for the new call
+  void emitDealloc(IRBuilder<> &Builder, Value *Ptr, CallGraph *CG) const;
+
+  Shape() = default;
+  explicit Shape(Function &F, bool OptimizeFrame = false)
+      : OptimizeFrame(OptimizeFrame) {
+    buildFrom(F);
+  }
+  void buildFrom(Function &F);
+};
+
+} // end namespace coro
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_COROUTINES_COROSHAPE_H
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index cdc442bc819c3..5cc13a584aef3 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -12,6 +12,7 @@
 
 #include "CoroInstr.h"
 #include "CoroInternal.h"
+#include "CoroShape.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/CallGraph.h"
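[Editorial aside, not part of the patch series] With CoroShape.h split out, code outside CoroSplit can construct and inspect a coro::Shape directly. The sketch below is a hypothetical illustration of what that enables for a future ABI plugin: the quoted include only resolves from inside lib/Transforms/Coroutines at this point, and would become a public llvm/Transforms/Coroutines path once the header moves as the commit message anticipates. Note also that, at this point in the series, the Shape constructor still rewrites intrinsics in F, so this should only run where mutating F is acceptable.

    // Sketch: classify a coroutine by its lowering ABI via the new header.
    // Assumes in-tree placement next to the Coroutines transforms.
    #include "CoroShape.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    static bool isSwitchLoweredCoroutine(Function &F) {
      // Collects coro.* intrinsics; in this revision it also performs the
      // initial lowering as a side effect.
      coro::Shape S(F);
      // If no coro.begin was found, F is not a coroutine at all.
      return S.CoroBegin && S.ABI == coro::ABI::Switch;
    }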

From ba9e523436b62f4f683c14a5f5b7b5b089bd5ac9 Mon Sep 17 00:00:00 2001
From: tnowicki
Date: Sat, 24 Aug 2024 02:33:38 -0400
Subject: [PATCH 43/43] [Coroutines] Refactor CoroShape::buildFrom to support ABI

* Refactor buildFrom to separate analysis, ABI-related operations, tidying,
  and bailout.
* In a follow-up PR the code in initABI will be moved to an ABI object init
  method, and the Shape constructor will no longer perform any lowering;
  instead it will just run the analysis. This will make the Shape object more
  useful because it can be constructed and used anywhere. It may even be
  useful to make it an analysis pass.
* In a follow-up PR the OptimizeFrame flag will also be removed from the
  Shape and instead be passed directly to buildCoroutineFrame (although it
  would be nice to find another way to trigger this optimization). This is
  the only thing that Shape cannot determine from the Function/Coroutine,
  but it is only needed within buildCoroutineFrame.
* Note that it was necessary to introduce two new SmallVectors, one to track
  CoroFrames and the other for UnusedCoroSaves. The tidyCoroutine method
  requires both, while the invalidateCoroutine (bailout) method requires only
  the former.

See RFC for more info:
https://discourse.llvm.org/t/rfc-abi-objects-for-coroutines/81057
---
 llvm/lib/Transforms/Coroutines/CoroShape.h    |  55 ++++++-
 llvm/lib/Transforms/Coroutines/Coroutines.cpp | 137 +++++++++---------
 2 files changed, 117 insertions(+), 75 deletions(-)

diff --git a/llvm/lib/Transforms/Coroutines/CoroShape.h b/llvm/lib/Transforms/Coroutines/CoroShape.h
index f5798b63bf732..3d1b38082173d 100644
--- a/llvm/lib/Transforms/Coroutines/CoroShape.h
+++ b/llvm/lib/Transforms/Coroutines/CoroShape.h
@@ -50,15 +50,49 @@ enum class ABI {
 
 // Holds structural Coroutine Intrinsics for a particular function and other
 // values used during CoroSplit pass.
 struct LLVM_LIBRARY_VISIBILITY Shape {
-  CoroBeginInst *CoroBegin;
+  CoroBeginInst *CoroBegin = nullptr;
   SmallVector<AnyCoroEndInst *, 4> CoroEnds;
   SmallVector<CoroSizeInst *, 2> CoroSizes;
   SmallVector<CoroAlignInst *, 2> CoroAligns;
   SmallVector<AnyCoroSuspendInst *, 4> CoroSuspends;
-  SmallVector<CallInst *, 2> SwiftErrorOps;
   SmallVector<CoroAwaitSuspendInst *, 4> CoroAwaitSuspends;
   SmallVector<CallInst *> SymmetricTransfers;
 
+  // Values invalidated by invalidateCoroutine() and tidyCoroutine()
+  SmallVector<CoroFrameInst *, 8> CoroFrames;
+  SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
+
+  // Values invalidated by replaceSwiftErrorOps()
+  SmallVector<CallInst *, 2> SwiftErrorOps;
+
+  void clear() {
+    CoroBegin = nullptr;
+    CoroEnds.clear();
+    CoroSizes.clear();
+    CoroAligns.clear();
+    CoroSuspends.clear();
+    CoroAwaitSuspends.clear();
+    SymmetricTransfers.clear();
+
+    CoroFrames.clear();
+    UnusedCoroSaves.clear();
+
+    SwiftErrorOps.clear();
+
+    FrameTy = nullptr;
+    FramePtr = nullptr;
+    AllocaSpillBlock = nullptr;
+  }
+
+  // Scan the function and collect the above intrinsics for later processing
+  void analyze(Function &F);
+  // If for some reason, we were not able to find coro.begin, bailout.
+  void invalidateCoroutine(Function &F);
+  // Perform ABI related initial transformation
+  void initABI();
+  // Remove orphaned and unnecessary intrinsics
+  void tidyCoroutine();
+
   // Field indexes for special fields in the switch lowering.
   struct SwitchFieldIndex {
     enum {
@@ -76,11 +110,11 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
 
   coro::ABI ABI;
 
-  StructType *FrameTy;
+  StructType *FrameTy = nullptr;
   Align FrameAlign;
-  uint64_t FrameSize;
-  Value *FramePtr;
-  BasicBlock *AllocaSpillBlock;
+  uint64_t FrameSize = 0;
+  Value *FramePtr = nullptr;
+  BasicBlock *AllocaSpillBlock = nullptr;
 
   /// This would only be true if optimization are enabled.
   bool OptimizeFrame;
@@ -237,9 +271,14 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
   Shape() = default;
   explicit Shape(Function &F, bool OptimizeFrame = false)
       : OptimizeFrame(OptimizeFrame) {
-    buildFrom(F);
+    analyze(F);
+    if (!CoroBegin) {
+      invalidateCoroutine(F);
+      return;
+    }
+    initABI();
+    tidyCoroutine();
   }
-  void buildFrom(Function &F);
 };
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 5cc13a584aef3..c1042b21883f6 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -177,17 +177,6 @@ void coro::suppressCoroAllocs(LLVMContext &Context,
   }
 }
 
-static void clear(coro::Shape &Shape) {
-  Shape.CoroBegin = nullptr;
-  Shape.CoroEnds.clear();
-  Shape.CoroSizes.clear();
-  Shape.CoroSuspends.clear();
-
-  Shape.FrameTy = nullptr;
-  Shape.FramePtr = nullptr;
-  Shape.AllocaSpillBlock = nullptr;
-}
-
 static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin,
                                     CoroSuspendInst *SuspendInst) {
   Module *M = SuspendInst->getModule();
@@ -200,13 +189,12 @@ static CoroSaveInst *createCoroSave(CoroBeginInst *CoroBegin,
 }
 
 // Collect "interesting" coroutine intrinsics.
-void coro::Shape::buildFrom(Function &F) {
+void coro::Shape::analyze(Function &F) {
+  clear();
+
   bool HasFinalSuspend = false;
   bool HasUnwindCoroEnd = false;
   size_t FinalSuspendIndex = 0;
-  clear(*this);
-  SmallVector<CoroFrameInst *, 8> CoroFrames;
-  SmallVector<CoroSaveInst *, 2> UnusedCoroSaves;
 
   for (Instruction &I : instructions(F)) {
     // FIXME: coro_await_suspend_* are not proper `IntrinisicInst`s
@@ -298,8 +286,58 @@
     }
   }
 
-  // If for some reason, we were not able to find coro.begin, bailout.
-  if (!CoroBegin) {
+  // If there is no CoroBegin then this is not a coroutine.
+  if (!CoroBegin)
+    return;
+
+  // Determination of ABI and initializing lowering info
+  auto Id = CoroBegin->getId();
+  auto IntrID = Id->getIntrinsicID();
+  if (IntrID == Intrinsic::coro_id) {
+    ABI = coro::ABI::Switch;
+    SwitchLowering.HasFinalSuspend = HasFinalSuspend;
+    SwitchLowering.HasUnwindCoroEnd = HasUnwindCoroEnd;
+
+    auto SwitchId = getSwitchCoroId();
+    SwitchLowering.ResumeSwitch = nullptr;
+    SwitchLowering.PromiseAlloca = SwitchId->getPromise();
+    SwitchLowering.ResumeEntryBlock = nullptr;
+
+    // Move final suspend to the last element in the CoroSuspends vector.
+    if (SwitchLowering.HasFinalSuspend &&
+        FinalSuspendIndex != CoroSuspends.size() - 1)
+      std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
+  } else if (IntrID == Intrinsic::coro_id_async) {
+    ABI = coro::ABI::Async;
+    auto *AsyncId = getAsyncCoroId();
+    AsyncId->checkWellFormed();
+    AsyncLowering.Context = AsyncId->getStorage();
+    AsyncLowering.ContextArgNo = AsyncId->getStorageArgumentIndex();
+    AsyncLowering.ContextHeaderSize = AsyncId->getStorageSize();
+    AsyncLowering.ContextAlignment = AsyncId->getStorageAlignment().value();
+    AsyncLowering.AsyncFuncPointer = AsyncId->getAsyncFunctionPointer();
+    AsyncLowering.AsyncCC = F.getCallingConv();
+  } else if (IntrID == Intrinsic::coro_id_retcon ||
+             IntrID == Intrinsic::coro_id_retcon_once) {
+    ABI = IntrID == Intrinsic::coro_id_retcon ? coro::ABI::Retcon
+                                              : coro::ABI::RetconOnce;
+    auto ContinuationId = getRetconCoroId();
+    ContinuationId->checkWellFormed();
+    auto Prototype = ContinuationId->getPrototype();
+    RetconLowering.ResumePrototype = Prototype;
+    RetconLowering.Alloc = ContinuationId->getAllocFunction();
+    RetconLowering.Dealloc = ContinuationId->getDeallocFunction();
+    RetconLowering.ReturnBlock = nullptr;
+    RetconLowering.IsFrameInlineInStorage = false;
+  } else {
+    llvm_unreachable("coro.begin is not dependent on a coro.id call");
+  }
+}
+
+// If for some reason, we were not able to find coro.begin, bailout.
+void coro::Shape::invalidateCoroutine(Function &F) {
+  assert(!CoroBegin);
+  {
     // Replace coro.frame which are supposed to be lowered to the result of
     // coro.begin with undef.
     auto *Undef = UndefValue::get(PointerType::get(F.getContext(), 0));
@@ -320,21 +358,13 @@
 
     // Replace all coro.ends with unreachable instruction.
     for (AnyCoroEndInst *CE : CoroEnds)
       changeToUnreachable(CE);
-
-    return;
   }
+}
 
-  auto Id = CoroBegin->getId();
-  switch (auto IdIntrinsic = Id->getIntrinsicID()) {
-  case Intrinsic::coro_id: {
-    auto SwitchId = cast<CoroIdInst>(Id);
-    this->ABI = coro::ABI::Switch;
-    this->SwitchLowering.HasFinalSuspend = HasFinalSuspend;
-    this->SwitchLowering.HasUnwindCoroEnd = HasUnwindCoroEnd;
-    this->SwitchLowering.ResumeSwitch = nullptr;
-    this->SwitchLowering.PromiseAlloca = SwitchId->getPromise();
-    this->SwitchLowering.ResumeEntryBlock = nullptr;
-
+// Perform semantic checking and initialization of the ABI
+void coro::Shape::initABI() {
+  switch (ABI) {
+  case coro::ABI::Switch: {
     for (auto *AnySuspend : CoroSuspends) {
       auto Suspend = dyn_cast<CoroSuspendInst>(AnySuspend);
       if (!Suspend) {
@@ -349,33 +379,11 @@
     }
     break;
   }
-  case Intrinsic::coro_id_async: {
-    auto *AsyncId = cast<CoroIdAsyncInst>(Id);
-    AsyncId->checkWellFormed();
-    this->ABI = coro::ABI::Async;
-    this->AsyncLowering.Context = AsyncId->getStorage();
-    this->AsyncLowering.ContextArgNo = AsyncId->getStorageArgumentIndex();
-    this->AsyncLowering.ContextHeaderSize = AsyncId->getStorageSize();
-    this->AsyncLowering.ContextAlignment =
-        AsyncId->getStorageAlignment().value();
-    this->AsyncLowering.AsyncFuncPointer = AsyncId->getAsyncFunctionPointer();
-    this->AsyncLowering.AsyncCC = F.getCallingConv();
+  case coro::ABI::Async: {
     break;
   };
-  case Intrinsic::coro_id_retcon:
-  case Intrinsic::coro_id_retcon_once: {
-    auto ContinuationId = cast<AnyCoroIdRetconInst>(Id);
-    ContinuationId->checkWellFormed();
-    this->ABI = (IdIntrinsic == Intrinsic::coro_id_retcon
-                     ? coro::ABI::Retcon
-                     : coro::ABI::RetconOnce);
-    auto Prototype = ContinuationId->getPrototype();
-    this->RetconLowering.ResumePrototype = Prototype;
-    this->RetconLowering.Alloc = ContinuationId->getAllocFunction();
-    this->RetconLowering.Dealloc = ContinuationId->getDeallocFunction();
-    this->RetconLowering.ReturnBlock = nullptr;
-    this->RetconLowering.IsFrameInlineInStorage = false;
-
+  case coro::ABI::Retcon:
+  case coro::ABI::RetconOnce: {
     // Determine the result value types, and make sure they match up with
     // the values passed to the suspends.
     auto ResultTys = getRetconResultTypes();
@@ -408,7 +416,7 @@ void coro::Shape::buildFrom(Function &F) {
 
 #ifndef NDEBUG
         Suspend->dump();
-        Prototype->getFunctionType()->dump();
+        RetconLowering.ResumePrototype->getFunctionType()->dump();
 #endif
         report_fatal_error("argument to coro.suspend.retcon does not "
                            "match corresponding prototype function result");
@@ -417,14 +425,14 @@
       if (SI != SE || RI != RE) {
 #ifndef NDEBUG
         Suspend->dump();
-        Prototype->getFunctionType()->dump();
+        RetconLowering.ResumePrototype->getFunctionType()->dump();
 #endif
         report_fatal_error("wrong number of arguments to coro.suspend.retcon");
       }
 
       // Check that the result type of the suspend matches the resume types.
       Type *SResultTy = Suspend->getType();
-      ArrayRef<Type *> SuspendResultTys;
+      ArrayRef<Type *> SuspendResultTys;
       if (SResultTy->isVoidTy()) {
         // leave as empty array
       } else if (auto SResultStructTy = dyn_cast<StructType>(SResultTy)) {
@@ -436,7 +444,7 @@
       if (SuspendResultTys.size() != ResumeTys.size()) {
 #ifndef NDEBUG
         Suspend->dump();
-        Prototype->getFunctionType()->dump();
+        RetconLowering.ResumePrototype->getFunctionType()->dump();
 #endif
         report_fatal_error("wrong number of results from coro.suspend.retcon");
       }
@@ -444,7 +452,7 @@
         if (SuspendResultTys[I] != ResumeTys[I]) {
 #ifndef NDEBUG
           Suspend->dump();
-          Prototype->getFunctionType()->dump();
+          RetconLowering.ResumePrototype->getFunctionType()->dump();
 #endif
           report_fatal_error("result from coro.suspend.retcon does not "
                              "match corresponding prototype function param");
@@ -453,23 +461,18 @@
     }
     break;
   }
-
   default:
     llvm_unreachable("coro.begin is not dependent on a coro.id call");
   }
+}
 
+void coro::Shape::tidyCoroutine() {
   // The coro.free intrinsic is always lowered to the result of coro.begin.
   for (CoroFrameInst *CF : CoroFrames) {
     CF->replaceAllUsesWith(CoroBegin);
     CF->eraseFromParent();
   }
 
-  // Move final suspend to be the last element in the CoroSuspends vector.
-  if (ABI == coro::ABI::Switch &&
-      SwitchLowering.HasFinalSuspend &&
-      FinalSuspendIndex != CoroSuspends.size() - 1)
-    std::swap(CoroSuspends[FinalSuspendIndex], CoroSuspends.back());
-
   // Remove orphaned coro.saves.
   for (CoroSaveInst *CoroSave : UnusedCoroSaves)
     CoroSave->eraseFromParent();
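[Editorial aside, not part of the patch series] Taken together, this patch decomposes the old monolithic buildFrom into four steps that the constructor now sequences. The following sketch spells out that lifecycle using the members introduced above; it mirrors what the updated constructor does internally and is illustrative rather than an additional API.

    // Sketch: the decomposed Shape lifecycle after this patch, for some
    // llvm::Function &F that may or may not be a coroutine.
    coro::Shape S;              // Shape() = default leaves the object empty.
    S.analyze(F);               // Scan F, collect coro.* intrinsics, and
                                // record the ABI and lowering info.
    if (!S.CoroBegin) {
      S.invalidateCoroutine(F); // Bailout: strip leftover coro intrinsics
                                // from a function that is not a coroutine.
    } else {
      S.initABI();              // ABI-specific semantic checks and setup.
      S.tidyCoroutine();        // Lower coro.free to the coro.begin result
                                // and remove orphaned coro.saves.
    }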