From 28b9cdd26073c79be36c79476a9bacceca3d116f Mon Sep 17 00:00:00 2001
From: Adrian Prantl
Date: Wed, 8 Jan 2020 14:31:27 -0800
Subject: [PATCH 01/89] Delete dead code.

https://reviews.llvm.org/D58856
---
 .../SymbolFile/DWARF/DWARFASTParserClang.cpp | 48 -------------------
 1 file changed, 48 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
index bb75c699352e7..232063a6f3390 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParserClang.cpp
@@ -2298,54 +2298,6 @@ size_t DWARFASTParserClang::ParseChildEnumerators(
   return enumerators_added;
 }
 
-#if defined(LLDB_CONFIGURATION_DEBUG) || defined(LLDB_CONFIGURATION_RELEASE)
-
-class DIEStack {
-public:
-  void Push(const DWARFDIE &die) { m_dies.push_back(die); }
-
-  void LogDIEs(Log *log) {
-    StreamString log_strm;
-    const size_t n = m_dies.size();
-    log_strm.Printf("DIEStack[%" PRIu64 "]:\n", (uint64_t)n);
-    for (size_t i = 0; i < n; i++) {
-      std::string qualified_name;
-      const DWARFDIE &die = m_dies[i];
-      die.GetQualifiedName(qualified_name);
-      log_strm.Printf("[%" PRIu64 "] 0x%8.8x: %s name='%s'\n", (uint64_t)i,
-                      die.GetOffset(), die.GetTagAsCString(),
-                      qualified_name.c_str());
-    }
-    log->PutCString(log_strm.GetData());
-  }
-  void Pop() { m_dies.pop_back(); }
-
-  class ScopedPopper {
-  public:
-    ScopedPopper(DIEStack &die_stack)
-        : m_die_stack(die_stack), m_valid(false) {}
-
-    void Push(const DWARFDIE &die) {
-      m_valid = true;
-      m_die_stack.Push(die);
-    }
-
-    ~ScopedPopper() {
-      if (m_valid)
-        m_die_stack.Pop();
-    }
-
-  protected:
-    DIEStack &m_die_stack;
-    bool m_valid;
-  };
-
-protected:
-  typedef std::vector Stack;
-  Stack m_dies;
-};
-#endif
-
 Function *DWARFASTParserClang::ParseFunctionFromDWARF(CompileUnit &comp_unit,
                                                       const DWARFDIE &die) {
   DWARFRangeList func_ranges;

From 58deb20dd2dfbfbfff8097ce80137d12a57a3607 Mon Sep 17 00:00:00 2001
From: Evgenii Stepanov
Date: Wed, 8 Jan 2020 14:33:28 -0800
Subject: [PATCH 02/89] Revert "Merge memtag instructions with adjacent stack slots."

*** Bad machine code: Tied use must be a register ***
- function: stg_alloca17
- basic block: %bb.0 entry (0x20076710580)
- instruction: early-clobber %0:gpr64common, early-clobber %1:gpr64sp = STGloop 272, %stack.0.a :: (store 272 into %ir.a, align 16)
- operand 3: %stack.0.a

http://lab.llvm.org:8011/builders/llvm-clang-x86_64-expensive-checks-win/builds/21481/steps/test-check-all/logs/stdio

This reverts commit b675a7628ce6a21b1e4a71c079a67badfb8b073d.
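
For context, the failure quoted above is the machine verifier enforcing that an operand tied to a def must itself be a register; operand 3 of the STGloop instruction is still the frame index %stack.0.a when the verifier runs, so the check fires. Below is a minimal, self-contained sketch of that invariant; the Operand type and verifyTiedUse helper are illustrative stand-ins, not the actual MachineVerifier code.

#include <cstdio>

// Stand-in for a machine operand: just enough state to show the rule.
struct Operand {
  bool IsReg;   // true for register operands
  bool IsTied;  // true if this use is tied to a def
};

// The invariant behind "Tied use must be a register": a tied use that is not
// a register operand (for example, a frame index such as %stack.0.a) is
// reported as bad machine code.
static bool verifyTiedUse(const Operand &MO) {
  if (MO.IsTied && !MO.IsReg) {
    std::fprintf(stderr, "*** Bad machine code: Tied use must be a register ***\n");
    return false;
  }
  return true;
}

int main() {
  Operand FrameIndexUse{/*IsReg=*/false, /*IsTied=*/true};
  return verifyTiedUse(FrameIndexUse) ? 0 : 1;
}
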
--- .../llvm/CodeGen/TargetFrameLowering.h | 7 - llvm/lib/CodeGen/PrologEpilogInserter.cpp | 4 - .../AArch64/AArch64ExpandPseudoInsts.cpp | 24 +- .../Target/AArch64/AArch64FrameLowering.cpp | 436 +----------------- .../lib/Target/AArch64/AArch64FrameLowering.h | 6 - llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 2 - llvm/lib/Target/AArch64/AArch64InstrInfo.td | 12 +- .../Target/AArch64/AArch64RegisterInfo.cpp | 21 +- .../AArch64/AArch64SelectionDAGInfo.cpp | 18 +- llvm/test/CodeGen/AArch64/settag-merge.ll | 214 --------- llvm/test/CodeGen/AArch64/settag-merge.mir | 83 ---- llvm/test/CodeGen/AArch64/settag.ll | 17 +- .../AArch64/stack-tagging-unchecked-ld-st.ll | 7 +- 13 files changed, 43 insertions(+), 808 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/settag-merge.ll delete mode 100644 llvm/test/CodeGen/AArch64/settag-merge.mir diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index a0beee36c7484..c7d4c4d7e5d44 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -309,13 +309,6 @@ class TargetFrameLowering { RegScavenger *RS = nullptr) const { } - /// processFunctionBeforeFrameIndicesReplaced - This method is called - /// immediately before MO_FrameIndex operands are eliminated, but after the - /// frame is finalized. This method is optional. - virtual void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS = nullptr) const {} - virtual unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const { report_fatal_error("WinEH not implemented for this target"); } diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp index d583643ac68f7..3909b57172814 100644 --- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp +++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp @@ -259,10 +259,6 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) { for (auto &I : EntryDbgValues) I.first->insert(I.first->begin(), I.second.begin(), I.second.end()); - // Allow the target machine to make final modifications to the function - // before the frame layout is finalized. - TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS); - // Replace all MO_FrameIndex operands with physical register references // and actual offsets. // diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 97162ae221871..3b8f8a19fe49c 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -349,38 +349,22 @@ bool AArch64ExpandPseudo::expandSetTagLoop( MachineBasicBlock::iterator &NextMBBI) { MachineInstr &MI = *MBBI; DebugLoc DL = MI.getDebugLoc(); - Register SizeReg = MI.getOperand(0).getReg(); - Register AddressReg = MI.getOperand(1).getReg(); + Register SizeReg = MI.getOperand(2).getReg(); + Register AddressReg = MI.getOperand(3).getReg(); MachineFunction *MF = MBB.getParent(); bool ZeroData = MI.getOpcode() == AArch64::STZGloop; - const unsigned OpCode1 = - ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex; - const unsigned OpCode2 = + const unsigned OpCode = ZeroData ? 
AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex; - unsigned Size = MI.getOperand(2).getImm(); - assert(Size > 0 && Size % 16 == 0); - if (Size % (16 * 2) != 0) { - BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg) - .addReg(AddressReg) - .addReg(AddressReg) - .addImm(1); - Size -= 16; - } - MachineBasicBlock::iterator I = - BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg) - .addImm(Size); - expandMOVImm(MBB, I, 64); - auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock()); MF->insert(++MBB.getIterator(), LoopBB); MF->insert(++LoopBB->getIterator(), DoneBB); - BuildMI(LoopBB, DL, TII->get(OpCode2)) + BuildMI(LoopBB, DL, TII->get(OpCode)) .addDef(AddressReg) .addReg(AddressReg) .addReg(AddressReg) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 39d32863f15b2..c732106014e6c 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -170,11 +170,6 @@ static cl::opt cl::desc("reverse the CSR restore sequence"), cl::init(false), cl::Hidden); -static cl::opt StackTaggingMergeSetTag( - "stack-tagging-merge-settag", - cl::desc("merge settag instruction in function epilog"), cl::init(true), - cl::Hidden); - STATISTIC(NumRedZoneFunctions, "Number of functions using red zone"); /// This is the biggest offset to the stack pointer we can encode in aarch64 @@ -485,39 +480,6 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump( return true; } -bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue( - MachineBasicBlock &MBB, unsigned StackBumpBytes) const { - if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes)) - return false; - - if (MBB.empty()) - return true; - - // Disable combined SP bump if the last instruction is an MTE tag store. It - // is almost always better to merge SP adjustment into those instructions. - MachineBasicBlock::iterator LastI = MBB.getFirstTerminator(); - MachineBasicBlock::iterator Begin = MBB.begin(); - while (LastI != Begin) { - --LastI; - if (LastI->isTransient()) - continue; - if (!LastI->getFlag(MachineInstr::FrameDestroy)) - break; - } - switch (LastI->getOpcode()) { - case AArch64::STGloop: - case AArch64::STZGloop: - case AArch64::STGOffset: - case AArch64::STZGOffset: - case AArch64::ST2GOffset: - case AArch64::STZ2GOffset: - return false; - default: - return true; - } - llvm_unreachable("unreachable"); -} - // Given a load or a store instruction, generate an appropriate unwinding SEH // code on Windows. static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, @@ -1497,7 +1459,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF, // function. if (MF.hasEHFunclets()) AFI->setLocalStackSize(NumBytes - PrologueSaveSize); - bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes); + bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes); // Assume we can't combine the last pop with the sp restore. 
if (!CombineSPBump && PrologueSaveSize != 0) { @@ -2675,399 +2637,9 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized( .addImm(0); } -namespace { -struct TagStoreInstr { - MachineInstr *MI; - int64_t Offset, Size; - explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size) - : MI(MI), Offset(Offset), Size(Size) {} -}; - -class TagStoreEdit { - MachineFunction *MF; - MachineBasicBlock *MBB; - MachineRegisterInfo *MRI; - // Tag store instructions that are being replaced. - SmallVector TagStores; - // Combined memref arguments of the above instructions. - SmallVector CombinedMemRefs; - - // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg + - // FrameRegOffset + Size) with the address tag of SP. - Register FrameReg; - StackOffset FrameRegOffset; - int64_t Size; - // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end. - Optional FrameRegUpdate; - // MIFlags for any FrameReg updating instructions. - unsigned FrameRegUpdateFlags; - - // Use zeroing instruction variants. - bool ZeroData; - DebugLoc DL; - - void emitUnrolled(MachineBasicBlock::iterator InsertI); - void emitLoop(MachineBasicBlock::iterator InsertI); - -public: - TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData) - : MBB(MBB), ZeroData(ZeroData) { - MF = MBB->getParent(); - MRI = &MF->getRegInfo(); - } - // Add an instruction to be replaced. Instructions must be added in the - // ascending order of Offset, and have to be adjacent. - void addInstruction(TagStoreInstr I) { - assert((TagStores.empty() || - TagStores.back().Offset + TagStores.back().Size == I.Offset) && - "Non-adjacent tag store instructions."); - TagStores.push_back(I); - } - void clear() { TagStores.clear(); } - // Emit equivalent code at the given location, and erase the current set of - // instructions. May skip if the replacement is not profitable. May invalidate - // the input iterator and replace it with a valid one. - void emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast); -}; - -void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) { - const AArch64InstrInfo *TII = - MF->getSubtarget().getInstrInfo(); - - const int64_t kMinOffset = -256 * 16; - const int64_t kMaxOffset = 255 * 16; - - Register BaseReg = FrameReg; - int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes(); - if (BaseRegOffsetBytes < kMinOffset || - BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) { - Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); - emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg, - {BaseRegOffsetBytes, MVT::i8}, TII); - BaseReg = ScratchReg; - BaseRegOffsetBytes = 0; - } - - MachineInstr *LastI = nullptr; - while (Size) { - int64_t InstrSize = (Size > 16) ? 32 : 16; - unsigned Opcode = - InstrSize == 16 - ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset) - : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset); - MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode)) - .addReg(AArch64::SP) - .addReg(BaseReg) - .addImm(BaseRegOffsetBytes / 16) - .setMemRefs(CombinedMemRefs); - // A store to [BaseReg, #0] should go last for an opportunity to fold the - // final SP adjustment in the epilogue. 
- if (BaseRegOffsetBytes == 0) - LastI = I; - BaseRegOffsetBytes += InstrSize; - Size -= InstrSize; - } - - if (LastI) - MBB->splice(InsertI, MBB, LastI); -} - -void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) { - const AArch64InstrInfo *TII = - MF->getSubtarget().getInstrInfo(); - - Register BaseReg = FrameRegUpdate - ? FrameReg - : MRI->createVirtualRegister(&AArch64::GPR64RegClass); - Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass); - - emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII); - - int64_t LoopSize = Size; - // If the loop size is not a multiple of 32, split off one 16-byte store at - // the end to fold BaseReg update into. - if (FrameRegUpdate && *FrameRegUpdate) - LoopSize -= LoopSize % 32; - MachineInstr *LoopI = - BuildMI(*MBB, InsertI, DL, - TII->get(ZeroData ? AArch64::STZGloop : AArch64::STGloop)) - .addDef(SizeReg) - .addDef(BaseReg) - .addImm(LoopSize) - .addReg(BaseReg) - .setMemRefs(CombinedMemRefs); - if (FrameRegUpdate) - LoopI->setFlags(FrameRegUpdateFlags); - - int64_t ExtraBaseRegUpdate = - FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0; - if (LoopSize < Size) { - assert(FrameRegUpdate); - assert(Size - LoopSize == 16); - // Tag 16 more bytes at BaseReg and update BaseReg. - BuildMI(*MBB, InsertI, DL, - TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex)) - .addDef(BaseReg) - .addReg(BaseReg) - .addReg(BaseReg) - .addImm(1 + ExtraBaseRegUpdate / 16) - .setMemRefs(CombinedMemRefs) - .setMIFlags(FrameRegUpdateFlags); - } else if (ExtraBaseRegUpdate) { - // Update BaseReg. - BuildMI( - *MBB, InsertI, DL, - TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri)) - .addDef(BaseReg) - .addReg(BaseReg) - .addImm(std::abs(ExtraBaseRegUpdate)) - .addImm(0) - .setMIFlags(FrameRegUpdateFlags); - } -} - -// Check if *II is a register update that can be merged into STGloop that ends -// at (Reg + Size). RemainingOffset is the required adjustment to Reg after the -// end of the loop. -bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg, - int64_t Size, int64_t *TotalOffset) { - MachineInstr &MI = *II; - if ((MI.getOpcode() == AArch64::ADDXri || - MI.getOpcode() == AArch64::SUBXri) && - MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) { - unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm()); - int64_t Offset = MI.getOperand(2).getImm() << Shift; - if (MI.getOpcode() == AArch64::SUBXri) - Offset = -Offset; - int64_t AbsPostOffset = std::abs(Offset - Size); - const int64_t kMaxOffset = - 0xFFF; // Max encoding for unshifted ADDXri / SUBXri - if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) { - *TotalOffset = Offset; - return true; - } - } - return false; -} - -void mergeMemRefs(const SmallVectorImpl &TSE, - SmallVectorImpl &MemRefs) { - MemRefs.clear(); - for (auto &TS : TSE) { - MachineInstr *MI = TS.MI; - // An instruction without memory operands may access anything. Be - // conservative and return an empty list. 
- if (MI->memoperands_empty()) { - MemRefs.clear(); - return; - } - MemRefs.append(MI->memoperands_begin(), MI->memoperands_end()); - } -} - -void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI, - const AArch64FrameLowering *TFI, bool IsLast) { - if (TagStores.empty()) - return; - TagStoreInstr &FirstTagStore = TagStores[0]; - TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1]; - Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size; - DL = TagStores[0].MI->getDebugLoc(); - - unsigned Reg; - FrameRegOffset = TFI->resolveFrameOffsetReference( - *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg, - /*PreferFP=*/false, /*ForSimm=*/true); - FrameReg = Reg; - FrameRegUpdate = None; - - mergeMemRefs(TagStores, CombinedMemRefs); - - LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n"; - for (const auto &Instr - : TagStores) { dbgs() << " " << *Instr.MI; }); - - // Size threshold where a loop becomes shorter than a linear sequence of - // tagging instructions. - const int kSetTagLoopThreshold = 176; - if (Size < kSetTagLoopThreshold) { - if (TagStores.size() < 2) - return; - emitUnrolled(InsertI); - } else { - MachineInstr *UpdateInstr = nullptr; - int64_t TotalOffset; - if (IsLast) { - // See if we can merge base register update into the STGloop. - // This is done in AArch64LoadStoreOptimizer for "normal" stores, - // but STGloop is way too unusual for that, and also it only - // realistically happens in function epilogue. Also, STGloop is expanded - // before that pass. - if (InsertI != MBB->end() && - canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size, - &TotalOffset)) { - UpdateInstr = &*InsertI++; - LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n " - << *UpdateInstr); - } - } - - if (!UpdateInstr && TagStores.size() < 2) - return; - - if (UpdateInstr) { - FrameRegUpdate = TotalOffset; - FrameRegUpdateFlags = UpdateInstr->getFlags(); - } - emitLoop(InsertI); - if (UpdateInstr) - UpdateInstr->eraseFromParent(); - } - - for (auto &TS : TagStores) - TS.MI->eraseFromParent(); -} - -bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset, - int64_t &Size, bool &ZeroData) { - MachineFunction &MF = *MI.getParent()->getParent(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); - - unsigned Opcode = MI.getOpcode(); - ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset || - Opcode == AArch64::STZ2GOffset); - - if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) { - if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead()) - return false; - if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI()) - return false; - Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex()); - Size = MI.getOperand(2).getImm(); - return true; - } - - if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset) - Size = 16; - else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset) - Size = 32; - else - return false; - - if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI()) - return false; - - Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) + - 16 * MI.getOperand(2).getImm(); - return true; -} - -// Detect a run of memory tagging instructions for adjacent stack frame slots, -// and replace them with a shorter instruction sequence: -// * replace STG + STG with ST2G -// * replace STGloop + STGloop with STGloop -// This code needs to run when stack slot offsets are already known, but before -// FrameIndex operands in STG 
instructions are eliminated. -MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II, - const AArch64FrameLowering *TFI, - RegScavenger *RS) { - bool FirstZeroData; - int64_t Size, Offset; - MachineInstr &MI = *II; - MachineBasicBlock *MBB = MI.getParent(); - MachineBasicBlock::iterator NextI = ++II; - if (&MI == &MBB->instr_back()) - return II; - if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData)) - return II; - - SmallVector Instrs; - Instrs.emplace_back(&MI, Offset, Size); - - constexpr int kScanLimit = 10; - int Count = 0; - for (MachineBasicBlock::iterator E = MBB->end(); - NextI != E && Count < kScanLimit; ++NextI) { - MachineInstr &MI = *NextI; - bool ZeroData; - int64_t Size, Offset; - // Collect instructions that update memory tags with a FrameIndex operand - // and (when applicable) constant size, and whose output registers are dead - // (the latter is almost always the case in practice). Since these - // instructions effectively have no inputs or outputs, we are free to skip - // any non-aliasing instructions in between without tracking used registers. - if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) { - if (ZeroData != FirstZeroData) - break; - Instrs.emplace_back(&MI, Offset, Size); - continue; - } - - // Only count non-transient, non-tagging instructions toward the scan - // limit. - if (!MI.isTransient()) - ++Count; - - // Just in case, stop before the epilogue code starts. - if (MI.getFlag(MachineInstr::FrameSetup) || - MI.getFlag(MachineInstr::FrameDestroy)) - break; - - // Reject anything that may alias the collected instructions. - if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects()) - break; - } - - // New code will be inserted after the last tagging instruction we've found. - MachineBasicBlock::iterator InsertI = Instrs.back().MI; - InsertI++; - - llvm::stable_sort(Instrs, - [](const TagStoreInstr &Left, const TagStoreInstr &Right) { - return Left.Offset < Right.Offset; - }); - - // Make sure that we don't have any overlapping stores. - int64_t CurOffset = Instrs[0].Offset; - for (auto &Instr : Instrs) { - if (CurOffset > Instr.Offset) - return NextI; - CurOffset = Instr.Offset + Instr.Size; - } - - // Find contiguous runs of tagged memory and emit shorter instruction - // sequencies for them when possible. - TagStoreEdit TSE(MBB, FirstZeroData); - Optional EndOffset; - for (auto &Instr : Instrs) { - if (EndOffset && *EndOffset != Instr.Offset) { - // Found a gap. - TSE.emitCode(InsertI, TFI, /*IsLast = */ false); - TSE.clear(); - } - - TSE.addInstruction(Instr); - EndOffset = Instr.Offset + Instr.Size; - } - - TSE.emitCode(InsertI, TFI, /*IsLast = */ true); - - return InsertI; -} -} // namespace - -void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced( - MachineFunction &MF, RegScavenger *RS = nullptr) const { - if (StackTaggingMergeSetTag) - for (auto &BB : MF) - for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();) - II = tryMergeAdjacentSTG(II, this, RS); -} - -/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP -/// before the update. This is easily retrieved as it is exactly the offset -/// that is set in processFunctionBeforeFrameFinalized. +/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before +/// the update. This is easily retrieved as it is exactly the offset that is set +/// in processFunctionBeforeFrameFinalized. 
int AArch64FrameLowering::getFrameIndexReferencePreferSP( const MachineFunction &MF, int FI, unsigned &FrameReg, bool IgnoreSPUpdates) const { diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index 57a7924fb8f8f..b5719feb6b154 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -77,10 +77,6 @@ class AArch64FrameLowering : public TargetFrameLowering { void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - void - processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF, - RegScavenger *RS) const override; - unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; @@ -111,8 +107,6 @@ class AArch64FrameLowering : public TargetFrameLowering { int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF, int &MinCSFrameIndex, int &MaxCSFrameIndex) const; - bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB, - unsigned StackBumpBytes) const; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 0ed2a678c4f01..54f3f7c101324 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3458,8 +3458,6 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI, case AArch64::ST1Fourv1d: case AArch64::IRG: case AArch64::IRGstack: - case AArch64::STGloop: - case AArch64::STZGloop: return AArch64FrameOffsetCannotUpdate; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 04a23f31ffd60..f4d340c9f06a0 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1514,17 +1514,17 @@ def TAGPstack // register / expression for the tagged base pointer of the current function. def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>; -// Large STG to be expanded into a loop. $sz is the size, $Rn is start address. -// $Rn_wback is one past the end of the range. $Rm is the loop counter. +// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address. +// $Rn_wback is one past the end of the range. 
let isCodeGenOnly=1, mayStore=1 in { def STGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; def STZGloop - : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn), - [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >, + : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn), + [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >, Sched<[WriteAdr, WriteST]>; } diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 4a3778a2fd072..14f839cd4f812 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -390,10 +390,6 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI, if (isFrameOffsetLegal(MI, AArch64::SP, Offset)) return false; - // If even offset 0 is illegal, we don't want a virtual base register. - if (!isFrameOffsetLegal(MI, AArch64::SP, 0)) - return false; - // The offset likely isn't legal; we want to allocate a virtual base register. return true; } @@ -449,17 +445,6 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg, (void)Done; } -static Register getScratchRegisterForInstruction(MachineInstr &MI) { - // ST*Gloop can only have #fi in op3, and they have a constraint that - // op1==op3. Since op1 is early-clobber, it may (and also must) be used as the - // scratch register. - if (MI.getOpcode() == AArch64::STGloop || MI.getOpcode() == AArch64::STZGloop) - return MI.getOperand(1).getReg(); - else - return MI.getMF()->getRegInfo().createVirtualRegister( - &AArch64::GPR64RegClass); -} - void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned FIOperandNum, RegScavenger *RS) const { @@ -516,7 +501,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // in a scratch register. Offset = TFI->resolveFrameIndexReference( MF, FrameIndex, FrameReg, /*PreferFP=*/false, /*ForSimm=*/true); - Register ScratchReg = getScratchRegisterForInstruction(MI); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(AArch64::LDG), ScratchReg) @@ -545,7 +531,8 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // If we get here, the immediate doesn't fit into the instruction. We folded // as much as possible above. Handle the rest, providing a register that is // SP+LargeImm. 
- Register ScratchReg = getScratchRegisterForInstruction(MI); + Register ScratchReg = + MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass); emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII); MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true); } diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp index e050a0028eca6..ba61ed726e840 100644 --- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp @@ -125,13 +125,19 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag( return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand, ZeroData); - const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; - - if (Addr.getOpcode() == ISD::FrameIndex) { - int FI = cast(Addr)->getIndex(); - Addr = DAG.getTargetFrameIndex(FI, MVT::i64); + if (ObjSize % 32 != 0) { + SDNode *St1 = DAG.getMachineNode( + ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl, + {MVT::i64, MVT::Other}, + {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain}); + DAG.setNodeMemRefs(cast(St1), {BaseMemOperand}); + ObjSize -= 16; + Addr = SDValue(St1, 0); + Chain = SDValue(St1, 1); } - SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain}; + + const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other}; + SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain}; SDNode *St = DAG.getMachineNode( ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops); diff --git a/llvm/test/CodeGen/AArch64/settag-merge.ll b/llvm/test/CodeGen/AArch64/settag-merge.ll deleted file mode 100644 index 1bc93a82070f0..0000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.ll +++ /dev/null @@ -1,214 +0,0 @@ -; RUN: llc < %s -mtriple=aarch64 -mattr=+mte | FileCheck %s - -declare void @use(i8* %p) -declare void @llvm.aarch64.settag(i8* %p, i64 %a) -declare void @llvm.aarch64.settag.zero(i8* %p, i64 %a) - -define void @stg16_16() { -entry: -; CHECK-LABEL: stg16_16: -; CHECK: st2g sp, [sp], #32 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - ret void -} - -define i32 @stg16_16_16_16_ret() { -entry: -; CHECK-LABEL: stg16_16_16_16_ret: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: mov w0, wzr -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 -} - -define void @stg16_16_16_16() { -entry: -; CHECK-LABEL: stg16_16_16_16: -; CHECK: st2g sp, [sp, #32] -; CHECK: st2g sp, [sp], #64 -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret void -} - -define void @stg128_128_128_128() { -entry: -; CHECK-LABEL: stg128_128_128_128: -; CHECK: mov x8, #512 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 
128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 128, align 16 - %d = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %d, i64 128) - ret void -} - -define void @stg16_512_16() { -entry: -; CHECK-LABEL: stg16_512_16: -; CHECK: mov x8, #544 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 16) - ret void -} - -define void @stg512_512_512() { -entry: -; CHECK-LABEL: stg512_512_512: -; CHECK: mov x8, #1536 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 512, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - call void @llvm.aarch64.settag(i8* %c, i64 512) - ret void -} - -define void @early(i1 %flag) { -entry: -; CHECK-LABEL: early: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: st2g sp, [sp, # -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 48, align 16 - %b = alloca i8, i32 48, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 48) - call void @llvm.aarch64.settag(i8* %b, i64 48) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_128_128(i1 %flag) { -entry: -; CHECK-LABEL: early_128_128: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %b = alloca i8, i32 128, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %b, i64 128) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -define void @early_512_512(i1 %flag) { -entry: -; CHECK-LABEL: early_512_512: -; CHECK: tbz w0, #0, [[LABEL:.LBB.*]] -; CHECK: add x9, sp, # -; CHECK: mov x8, #1024 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: [[LABEL]]: -; CHECK: stg sp, [sp, # -; CHECK: st2g sp, [sp], # -; CHECK: ret - %a = alloca i8, i32 512, align 16 - %b = alloca i8, i32 512, align 16 - %c = alloca i8, i32 48, align 16 - br i1 %flag, label %if.then, label %if.end - -if.then: - call void @llvm.aarch64.settag(i8* %a, i64 512) - call void @llvm.aarch64.settag(i8* %b, i64 512) - br label %if.end - -if.end: - call void @llvm.aarch64.settag(i8* %c, i64 48) - ret void -} - -; Two loops of size 256; the second loop updates SP. 
-define void @stg128_128_gap_128_128() { -entry: -; CHECK-LABEL: stg128_128_gap_128_128: -; CHECK: mov x9, sp -; CHECK: mov x8, #256 -; CHECK: st2g x9, [x9], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: mov x8, #256 -; CHECK: st2g sp, [sp], #32 -; CHECK: sub x8, x8, #32 -; CHECK: cbnz x8, -; CHECK: ret - %a = alloca i8, i32 128, align 16 - %a2 = alloca i8, i32 128, align 16 - %b = alloca i8, i32 32, align 16 - %c = alloca i8, i32 128, align 16 - %c2 = alloca i8, i32 128, align 16 - call void @use(i8* %b) - call void @llvm.aarch64.settag(i8* %a, i64 128) - call void @llvm.aarch64.settag(i8* %a2, i64 128) - call void @llvm.aarch64.settag(i8* %c, i64 128) - call void @llvm.aarch64.settag(i8* %c2, i64 128) - ret void -} diff --git a/llvm/test/CodeGen/AArch64/settag-merge.mir b/llvm/test/CodeGen/AArch64/settag-merge.mir deleted file mode 100644 index dc2a00c7d3d37..0000000000000 --- a/llvm/test/CodeGen/AArch64/settag-merge.mir +++ /dev/null @@ -1,83 +0,0 @@ -# RUN: llc -mtriple=aarch64 -mattr=+mte -run-pass=prologepilog %s -o - | FileCheck %s - ---- | - declare void @llvm.aarch64.settag(i8* nocapture writeonly, i64) argmemonly nounwind writeonly "target-features"="+mte" - define i32 @stg16_16_16_16_ret() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 16, align 16 - %c = alloca i8, i32 16, align 16 - %d = alloca i8, i32 16, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - call void @llvm.aarch64.settag(i8* %b, i64 16) - call void @llvm.aarch64.settag(i8* %c, i64 16) - call void @llvm.aarch64.settag(i8* %d, i64 16) - ret i32 0 - } - - define void @stg16_store_128() "target-features"="+mte" { - entry: - %a = alloca i8, i32 16, align 16 - %b = alloca i8, i32 128, align 16 - call void @llvm.aarch64.settag(i8* %a, i64 16) - store i8 42, i8* %a - call void @llvm.aarch64.settag(i8* %b, i64 128) - ret void - } - -... ---- -# A sequence of STG with a register copy in the middle. -# Can be merged into ST2G + ST2G. -# CHECK-LABEL: name:{{.*}}stg16_16_16_16_ret -# CHECK-DAG: ST2GOffset $sp, $sp, 2 -# CHECK-DAG: ST2GOffset $sp, $sp, 0 -# CHECK-DAG: $w0 = COPY $wzr -# CHECK-DAG: RET_ReallyLR implicit killed $w0 - -name: stg16_16_16_16_ret -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 16, alignment: 16 } - - { id: 2, name: c, size: 16, alignment: 16 } - - { id: 3, name: d, size: 16, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - STGOffset $sp, %stack.1.b, 0 :: (store 16 into %ir.b) - STGOffset $sp, %stack.2.c, 0 :: (store 16 into %ir.c) - $w0 = COPY $wzr - STGOffset $sp, %stack.3.d, 0 :: (store 16 into %ir.d) - RET_ReallyLR implicit killed $w0 - -... - ---- -# A store in the middle prevents merging. 
-# CHECK-LABEL: name:{{.*}}stg16_store_128 -# CHECK: ST2GOffset $sp, $sp, 2 -# CHECK: ST2GOffset $sp, $sp, 4 -# CHECK: ST2GOffset $sp, $sp, 6 -# CHECK: STGOffset $sp, $sp, 8 -# CHECK: STRBBui -# CHECK: ST2GOffset $sp, $sp, 0 -# CHECK: RET_ReallyLR - -name: stg16_store_128 -tracksRegLiveness: true -stack: - - { id: 0, name: a, size: 16, alignment: 16 } - - { id: 1, name: b, size: 128, alignment: 16 } -body: | - bb.0.entry: - STGOffset $sp, %stack.0.a, 0 :: (store 16 into %ir.a) - renamable $w8 = MOVi32imm 42 - ST2GOffset $sp, %stack.1.b, 6 :: (store 32 into %ir.b + 96, align 16) - ST2GOffset $sp, %stack.1.b, 4 :: (store 32 into %ir.b + 64, align 16) - ST2GOffset $sp, %stack.1.b, 2 :: (store 32 into %ir.b + 32, align 16) - STRBBui killed renamable $w8, %stack.0.a, 0 :: (store 1 into %ir.a, align 16) - ST2GOffset $sp, %stack.1.b, 0 :: (store 32 into %ir.b, align 16) - RET_ReallyLR - -... diff --git a/llvm/test/CodeGen/AArch64/settag.ll b/llvm/test/CodeGen/AArch64/settag.ll index 3deeb0155fe87..9ca188fbce325 100644 --- a/llvm/test/CodeGen/AArch64/settag.ll +++ b/llvm/test/CodeGen/AArch64/settag.ll @@ -64,8 +64,8 @@ entry: define void @stg17(i8* %p) { entry: ; CHECK-LABEL: stg17: -; CHECK: stg x0, [x0], #16 ; CHECK: mov {{(w|x)}}[[R:[0-9]+]], #256 +; CHECK: stg x0, [x0], #16 ; CHECK: st2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -87,8 +87,8 @@ entry: define void @stzg17(i8* %p) { entry: ; CHECK-LABEL: stzg17: -; CHECK: stzg x0, [x0], #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 +; CHECK: stzg x0, [x0], #16 ; CHECK: stz2g x0, [x0], #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], @@ -110,10 +110,10 @@ entry: define void @stg_alloca5() { entry: ; CHECK-LABEL: stg_alloca5: -; CHECK: st2g sp, [sp, #32] -; CHECK-NEXT: stg sp, [sp, #64] -; CHECK-NEXT: st2g sp, [sp], #80 -; CHECK-NEXT: ret +; CHECK: stg sp, [sp, #64] +; CHECK: st2g sp, [sp, #32] +; CHECK: st2g sp, [sp] +; CHECK: ret %a = alloca i8, i32 80, align 16 call void @llvm.aarch64.settag(i8* %a, i64 80) ret void @@ -122,11 +122,12 @@ entry: define void @stg_alloca17() { entry: ; CHECK-LABEL: stg_alloca17: +; CHECK: mov [[P:x[0-9]+]], sp +; CHECK: stg [[P]], {{\[}}[[P]]{{\]}}, #16 ; CHECK: mov {{w|x}}[[R:[0-9]+]], #256 -; CHECK: st2g sp, [sp], #32 +; CHECK: st2g [[P]], {{\[}}[[P]]{{\]}}, #32 ; CHECK: sub x[[R]], x[[R]], #32 ; CHECK: cbnz x[[R]], -; CHECK: stg sp, [sp], #16 ; CHECK: ret %a = alloca i8, i32 272, align 16 call void @llvm.aarch64.settag(i8* %a, i64 272) diff --git a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll index ed6ccc8b49413..200837dabfe0e 100644 --- a/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll +++ b/llvm/test/CodeGen/AArch64/stack-tagging-unchecked-ld-st.ll @@ -210,10 +210,11 @@ entry: ; DEFAULT: ldrb [[A:w.*]], [x{{.*}}] ; DEFAULT: ldrb [[B:w.*]], [x{{.*}}] -; ALWAYS-DAG: ldg [[PA:x.*]], [x{{.*}}] -; ALWAYS-DAG: ldrb [[B:w.*]], [sp] -; ALWAYS-DAG: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; ALWAYS: ldg [[PA:x.*]], [x{{.*}}] +; ALWAYS: ldrb [[B:w.*]], [sp] +; ALWAYS: ldrb [[A:w.*]], {{\[}}[[PA]]{{\]}} +; COMMON: add w0, [[B]], [[A]] ; COMMON: ret ; One of these allocas is closer to FP than to SP, and within 256 bytes From f84d3200528a4aa1e8c7f2d3a9567f63f855b165 Mon Sep 17 00:00:00 2001 From: Ahmed Taei Date: Wed, 8 Jan 2020 12:28:11 -0800 Subject: [PATCH 03/89] [MLIR] Don't use SSA names directly for std.view canonicalization test Reviewers: rriddle, nicolasvasilache Subscribers: mehdi_amini, jpienaar, 
burmako, shauheen, antiagainst, arpith-jacob, mgester, lucyrfox, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D72408 --- mlir/test/Transforms/canonicalize.mlir | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mlir/test/Transforms/canonicalize.mlir b/mlir/test/Transforms/canonicalize.mlir index a6c2326290752..16ee00923d258 100644 --- a/mlir/test/Transforms/canonicalize.mlir +++ b/mlir/test/Transforms/canonicalize.mlir @@ -703,28 +703,28 @@ func @view(%arg0 : index) { %c15 = constant 15 : index // Test: fold constant sizes and offset, update map with static stride/offset. - // CHECK: std.view %0[][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP0]]> + // CHECK: std.view %[[ALLOC_MEM]][][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP0]]> %1 = view %0[%c15][%c7, %c11] : memref<2048xi8> to memref load %1[%c0, %c0] : memref // Test: fold constant sizes but not offset, update map with static stride. // Test that we do not a fold dynamic dim which is not produced by a constant. - // CHECK: std.view %0[%arg0][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP1]]> + // CHECK: std.view %[[ALLOC_MEM]][%arg0][] : memref<2048xi8> to memref<7x11xf32, #[[VIEW_MAP1]]> %2 = view %0[%arg0][%c7, %c11] : memref<2048xi8> to memref load %2[%c0, %c0] : memref // Test: fold constant offset but not sizes, update map with constant offset. // Test that we fold constant offset but not dynamic dims. - // CHECK: std.view %0[][%arg0, %arg0] : memref<2048xi8> to memref + // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref %3 = view %0[%c15][%arg0, %arg0] : memref<2048xi8> to memref load %3[%c0, %c0] : memref // Test: fold one constant dim, no offset, should update with constant // stride on dim 1, but leave dynamic stride on dim 0. - // CHECK: std.view %0[][%arg0, %arg0] : memref<2048xi8> to memref + // CHECK: std.view %[[ALLOC_MEM]][][%arg0, %arg0] : memref<2048xi8> to memref %4 = view %0[][%arg0, %arg0, %c7] : memref<2048xi8> to memref load %4[%c0, %c0, %c0] : memref @@ -736,7 +736,7 @@ func @view(%arg0 : index) { load %5[%c0, %c0] : memref // Test: folding static alloc and memref_cast into a view. - // CHECK: std.view %0[][%c15, %c7] : memref<2048xi8> to memref + // CHECK: std.view %[[ALLOC_MEM]][][%c15, %c7] : memref<2048xi8> to memref %6 = memref_cast %0 : memref<2048xi8> to memref %7 = view %6[%c15][%c7] : memref to memref load %7[%c0, %c0] : memref From 7f1026a7520eb3d7e99da77f6738deb72de17980 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 8 Jan 2020 15:44:35 -0800 Subject: [PATCH 04/89] [cfi][test] cross-dso/stats.cpp: don't assume the order of static constructors __sanitizer_stat_init is called for the executable first, then the shared object. In WriterModuleReport(), the information for the shared object will be recorded first. It'd be nice to get rid of the order requirement of static constructors. (This should make .ctors platforms work.) 
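
The ordering assumption being removed comes from dynamic initializers: the main executable and the shared object each register their statistics from a static constructor, and the relative order in which the loader runs those constructors is not something the test should depend on. A rough illustration of the pattern, with hypothetical file and symbol names that are not taken from compiler-rt:

// exe_part.cpp -- linked into the main executable
#include <cstdio>
static struct RegisterExeStats {
  RegisterExeStats() { std::puts("stats module registered: executable"); }
} g_exe_stats;

int main() { return 0; }

// dso_part.cpp -- compiled into the shared object loaded by the executable
#include <cstdio>
static struct RegisterDsoStats {
  RegisterDsoStats() { std::puts("stats module registered: shared object"); }
} g_dso_stats;

Whichever constructor runs first records its module first, which is why the checks in the diff below become CHECK-DAG: FileCheck then accepts the per-module report lines in either order.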
--- compiler-rt/test/cfi/cross-dso/stats.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler-rt/test/cfi/cross-dso/stats.cpp b/compiler-rt/test/cfi/cross-dso/stats.cpp index 09a7217bf066a..9d8c2ee3e0d95 100644 --- a/compiler-rt/test/cfi/cross-dso/stats.cpp +++ b/compiler-rt/test/cfi/cross-dso/stats.cpp @@ -22,24 +22,24 @@ extern "C" void nvcall(A *a); #ifdef SHARED_LIB extern "C" __attribute__((noinline)) void vcall(A *a) { - // CHECK: stats.cpp:[[@LINE+1]] vcall.cfi cfi-vcall 37 + // CHECK-DAG: stats.cpp:[[@LINE+1]] vcall.cfi cfi-vcall 37 a->vf(); } extern "C" __attribute__((noinline)) void nvcall(A *a) { - // CHECK: stats.cpp:[[@LINE+1]] nvcall.cfi cfi-nvcall 51 + // CHECK-DAG: stats.cpp:[[@LINE+1]] nvcall.cfi cfi-nvcall 51 a->nvf(); } #else extern "C" __attribute__((noinline)) A *dcast(A *a) { - // CHECK: stats.cpp:[[@LINE+1]] dcast.cfi cfi-derived-cast 24 + // CHECK-DAG: stats.cpp:[[@LINE+1]] dcast.cfi cfi-derived-cast 24 return (A *)(ABase *)a; } extern "C" __attribute__((noinline)) A *ucast(A *a) { - // CHECK: stats.cpp:[[@LINE+1]] ucast.cfi cfi-unrelated-cast 81 + // CHECK-DAG: stats.cpp:[[@LINE+1]] ucast.cfi cfi-unrelated-cast 81 return (A *)(char *)a; } From ae47a3d8107856c84c104f3c2e43a553f4e36748 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 8 Jan 2020 16:13:03 -0800 Subject: [PATCH 05/89] [lldb/SWIG] Refactor extensions to be non Python-specific (2/2) The current SWIG extensions for the string conversion operator is Python specific because it uses the PythonObjects. This means that the code cannot be reused for other SWIG supported languages such as Lua. This reimplements the extensions in a more generic way that can be reused. It uses a SWIG macro to reduce code duplication. Differential revision: https://reviews.llvm.org/D72377 --- lldb/scripts/Python/python-extensions.swig | 276 +----------------- lldb/scripts/interface/SBAddress.i | 2 + lldb/scripts/interface/SBBlock.i | 2 + lldb/scripts/interface/SBBreakpoint.i | 2 + lldb/scripts/interface/SBBreakpointLocation.i | 2 + lldb/scripts/interface/SBBreakpointName.i | 1 + .../scripts/interface/SBCommandReturnObject.i | 2 + lldb/scripts/interface/SBCompileUnit.i | 2 + lldb/scripts/interface/SBData.i | 2 + lldb/scripts/interface/SBDebugger.i | 2 + lldb/scripts/interface/SBDeclaration.i | 2 + lldb/scripts/interface/SBError.i | 2 + lldb/scripts/interface/SBFileSpec.i | 2 + lldb/scripts/interface/SBFrame.i | 2 + lldb/scripts/interface/SBFunction.i | 2 + lldb/scripts/interface/SBInstruction.i | 2 + lldb/scripts/interface/SBInstructionList.i | 2 + lldb/scripts/interface/SBLineEntry.i | 2 + lldb/scripts/interface/SBMemoryRegionInfo.i | 1 + lldb/scripts/interface/SBModule.i | 2 + lldb/scripts/interface/SBModuleSpec.i | 2 + lldb/scripts/interface/SBTarget.i | 16 +- lldb/scripts/lldb.swig | 18 ++ lldb/scripts/lldb_lua.swig | 1 + lldb/scripts/macros.swig | 33 +++ 25 files changed, 95 insertions(+), 287 deletions(-) create mode 100644 lldb/scripts/macros.swig diff --git a/lldb/scripts/Python/python-extensions.swig b/lldb/scripts/Python/python-extensions.swig index dbd4b1d79d005..36dac19644bd0 100644 --- a/lldb/scripts/Python/python-extensions.swig +++ b/lldb/scripts/Python/python-extensions.swig @@ -1,42 +1,4 @@ -%extend lldb::SBAddress { - %nothreadallow; - PyObject *lldb::SBAddress::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == 
'\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBBlock { - %nothreadallow; - PyObject *lldb::SBBlock::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} %extend lldb::SBBreakpoint { - %nothreadallow; - PyObject *lldb::SBBreakpoint::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -50,34 +12,6 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} - -} -%extend lldb::SBBreakpointLocation { - %nothreadallow; - PyObject *lldb::SBBreakpointLocation::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelFull); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBBreakpointName { - %nothreadallow; - PyObject *lldb::SBBreakpointName::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; } %extend lldb::SBBroadcaster { @@ -97,18 +31,6 @@ } %extend lldb::SBCommandReturnObject { - %nothreadallow; - PyObject *lldb::SBCommandReturnObject::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - /* the write() and flush() calls are not part of the SB API proper, and are solely for Python usage they are meant to make an SBCommandReturnObject into a file-like object so that instructions of the sort print >>sb_command_return_object, "something" @@ -122,18 +44,8 @@ void lldb::SBCommandReturnObject::flush () {} } + %extend lldb::SBCompileUnit { - %nothreadallow; - PyObject *lldb::SBCompileUnit::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -148,45 +60,8 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } -%extend lldb::SBData { - 
%nothreadallow; - PyObject *lldb::SBData::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBDebugger { - %nothreadallow; - PyObject *lldb::SBDebugger::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBDeclaration { - %nothreadallow; - PyObject *lldb::SBDeclaration::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; +%extend lldb::SBDeclaration { %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -200,60 +75,9 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} - -} -%extend lldb::SBError { - %nothreadallow; - PyObject *lldb::SBError::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBFileSpec { - %nothreadallow; - PyObject *lldb::SBFileSpec::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; } -%extend lldb::SBFrame { - %nothreadallow; - PyObject *lldb::SBFrame::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBFunction { - %nothreadallow; - PyObject *lldb::SBFunction::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; +%extend lldb::SBFunction { %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -267,47 +91,9 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} - -} -%extend lldb::SBInstruction { - %nothreadallow; - PyObject *lldb::SBInstruction::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char 
*desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBInstructionList { - %nothreadallow; - PyObject *lldb::SBInstructionList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; } -%extend lldb::SBLineEntry { - %nothreadallow; - PyObject *lldb::SBLineEntry::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; +%extend lldb::SBLineEntry { %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -323,33 +109,7 @@ %} } -%extend lldb::SBMemoryRegionInfo { - %nothreadallow; - PyObject *lldb::SBMemoryRegionInfo::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - %extend lldb::SBModule { - %nothreadallow; - PyObject *lldb::SBModule::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -365,34 +125,6 @@ %} } -%extend lldb::SBModuleSpec { - %nothreadallow; - PyObject *lldb::SBModuleSpec::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - -%extend lldb::SBModuleSpecList { - %nothreadallow; - PyObject *lldb::SBModuleSpecList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - %extend lldb::SBProcess { %nothreadallow; PyObject *lldb::SBProcess::__str__ (){ diff --git a/lldb/scripts/interface/SBAddress.i b/lldb/scripts/interface/SBAddress.i index 6c5352bac6d7c..4658534d153ea 100644 --- a/lldb/scripts/interface/SBAddress.i +++ b/lldb/scripts/interface/SBAddress.i @@ -140,6 +140,8 @@ public: lldb::SBLineEntry GetLineEntry (); + STRING_EXTENSION(SBAddress) + #ifdef SWIGPYTHON %pythoncode %{ def 
__get_load_addr_property__ (self): diff --git a/lldb/scripts/interface/SBBlock.i b/lldb/scripts/interface/SBBlock.i index 73079a11760c6..8bd8e37953cfc 100644 --- a/lldb/scripts/interface/SBBlock.i +++ b/lldb/scripts/interface/SBBlock.i @@ -100,6 +100,8 @@ public: bool locals, bool statics); + STRING_EXTENSION(SBBlock) + #ifdef SWIGPYTHON %pythoncode %{ def get_range_at_index(self, idx): diff --git a/lldb/scripts/interface/SBBreakpoint.i b/lldb/scripts/interface/SBBreakpoint.i index f84f2ada3d329..20354346be900 100644 --- a/lldb/scripts/interface/SBBreakpoint.i +++ b/lldb/scripts/interface/SBBreakpoint.i @@ -249,6 +249,8 @@ public: bool IsHardware (); + STRING_EXTENSION(SBBreakpoint) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBBreakpointLocation.i b/lldb/scripts/interface/SBBreakpointLocation.i index 44fd42b514f7f..dc39c83c2d67b 100644 --- a/lldb/scripts/interface/SBBreakpointLocation.i +++ b/lldb/scripts/interface/SBBreakpointLocation.i @@ -134,6 +134,8 @@ public: SBBreakpoint GetBreakpoint (); + + STRING_EXTENSION_LEVEL(SBBreakpointLocation, lldb::eDescriptionLevelFull) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBBreakpointName.i b/lldb/scripts/interface/SBBreakpointName.i index 2a06d0a2105f0..e280d42245915 100644 --- a/lldb/scripts/interface/SBBreakpointName.i +++ b/lldb/scripts/interface/SBBreakpointName.i @@ -108,6 +108,7 @@ public: bool GetDescription(lldb::SBStream &description); + STRING_EXTENSION(SBBreakpointName) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBCommandReturnObject.i b/lldb/scripts/interface/SBCommandReturnObject.i index 73d4001aaba59..affa16520f28d 100644 --- a/lldb/scripts/interface/SBCommandReturnObject.i +++ b/lldb/scripts/interface/SBCommandReturnObject.i @@ -96,6 +96,8 @@ public: void SetImmediateOutputFile(lldb::FileSP BORROWED); void SetImmediateErrorFile(lldb::FileSP BORROWED); + STRING_EXTENSION(SBCommandReturnObject) + %extend { // transfer_ownership does nothing, and is here for compatibility with // old scripts. Ownership is tracked by reference count in the ordinary way. diff --git a/lldb/scripts/interface/SBCompileUnit.i b/lldb/scripts/interface/SBCompileUnit.i index bc2d45ae8e56c..d6a4c07038c65 100644 --- a/lldb/scripts/interface/SBCompileUnit.i +++ b/lldb/scripts/interface/SBCompileUnit.i @@ -116,6 +116,8 @@ public: bool operator != (const lldb::SBCompileUnit &rhs) const; + STRING_EXTENSION(SBCompileUnit) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBData.i b/lldb/scripts/interface/SBData.i index fdaa6962f0eca..3e74240329e05 100644 --- a/lldb/scripts/interface/SBData.i +++ b/lldb/scripts/interface/SBData.i @@ -134,6 +134,8 @@ public: bool SetDataFromDoubleArray (double* array, size_t array_len); + STRING_EXTENSION(SBData) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBDebugger.i b/lldb/scripts/interface/SBDebugger.i index 52f65841893c6..f2e23a7ed7804 100644 --- a/lldb/scripts/interface/SBDebugger.i +++ b/lldb/scripts/interface/SBDebugger.i @@ -479,6 +479,8 @@ public: lldb::SBTypeSynthetic GetSyntheticForType (lldb::SBTypeNameSpecifier); + STRING_EXTENSION(SBDebugger) + %feature("docstring", "Launch a command interpreter session. Commands are read from standard input or from the input handle specified for the debugger object. 
Output/errors are diff --git a/lldb/scripts/interface/SBDeclaration.i b/lldb/scripts/interface/SBDeclaration.i index cdaec85676461..621c1a0ab7c87 100644 --- a/lldb/scripts/interface/SBDeclaration.i +++ b/lldb/scripts/interface/SBDeclaration.i @@ -53,6 +53,8 @@ namespace lldb { bool operator != (const lldb::SBDeclaration &rhs) const; + STRING_EXTENSION(SBDeclaration) + #ifdef SWIGPYTHON %pythoncode %{ file = property(GetFileSpec, None, doc='''A read only property that returns an lldb object that represents the file (lldb.SBFileSpec) for this line entry.''') diff --git a/lldb/scripts/interface/SBError.i b/lldb/scripts/interface/SBError.i index 96cd6c4886f5f..ea48e2263a77a 100644 --- a/lldb/scripts/interface/SBError.i +++ b/lldb/scripts/interface/SBError.i @@ -105,6 +105,8 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBError) + #ifdef SWIGPYTHON %pythoncode %{ value = property(GetError, None, doc='''A read only property that returns the same result as GetError().''') diff --git a/lldb/scripts/interface/SBFileSpec.i b/lldb/scripts/interface/SBFileSpec.i index 07a7630ebbac8..d287a940c051a 100644 --- a/lldb/scripts/interface/SBFileSpec.i +++ b/lldb/scripts/interface/SBFileSpec.i @@ -80,6 +80,8 @@ public: void AppendPathComponent (const char *file_or_directory); + STRING_EXTENSION(SBFileSpec) + #ifdef SWIGPYTHON %pythoncode %{ def __get_fullpath__(self): diff --git a/lldb/scripts/interface/SBFrame.i b/lldb/scripts/interface/SBFrame.i index 811f7f22f9b4d..c65b88f863e7d 100644 --- a/lldb/scripts/interface/SBFrame.i +++ b/lldb/scripts/interface/SBFrame.i @@ -285,6 +285,8 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBFrame) + #ifdef SWIGPYTHON %pythoncode %{ def get_all_variables(self): diff --git a/lldb/scripts/interface/SBFunction.i b/lldb/scripts/interface/SBFunction.i index 7b157bb388169..630c4db22c55d 100644 --- a/lldb/scripts/interface/SBFunction.i +++ b/lldb/scripts/interface/SBFunction.i @@ -111,6 +111,8 @@ public: bool operator != (const lldb::SBFunction &rhs) const; + STRING_EXTENSION(SBFunction) + #ifdef SWIGPYTHON %pythoncode %{ def get_instructions_from_current_target (self): diff --git a/lldb/scripts/interface/SBInstruction.i b/lldb/scripts/interface/SBInstruction.i index 09688214630b1..d50a080fd0454 100644 --- a/lldb/scripts/interface/SBInstruction.i +++ b/lldb/scripts/interface/SBInstruction.i @@ -74,6 +74,8 @@ public: bool TestEmulation (lldb::SBStream &output_stream, const char *test_file); + STRING_EXTENSION(SBInstruction) + #ifdef SWIGPYTHON %pythoncode %{ def __mnemonic_property__ (self): diff --git a/lldb/scripts/interface/SBInstructionList.i b/lldb/scripts/interface/SBInstructionList.i index d50deba4f5e1f..1357323027573 100644 --- a/lldb/scripts/interface/SBInstructionList.i +++ b/lldb/scripts/interface/SBInstructionList.i @@ -66,6 +66,8 @@ public: bool DumpEmulationForAllInstructions (const char *triple); + STRING_EXTENSION(SBInstructionList) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBLineEntry.i b/lldb/scripts/interface/SBLineEntry.i index 90f60df23247d..be365377ba8b0 100644 --- a/lldb/scripts/interface/SBLineEntry.i +++ b/lldb/scripts/interface/SBLineEntry.i @@ -84,6 +84,8 @@ public: bool operator != (const lldb::SBLineEntry &rhs) const; + STRING_EXTENSION(SBLineEntry) + #ifdef SWIGPYTHON %pythoncode %{ file = property(GetFileSpec, None, doc='''A read only property that returns an lldb object that represents the file (lldb.SBFileSpec) for this line 
entry.''') diff --git a/lldb/scripts/interface/SBMemoryRegionInfo.i b/lldb/scripts/interface/SBMemoryRegionInfo.i index 7a59d0051ceac..6a2ad6a3e3649 100644 --- a/lldb/scripts/interface/SBMemoryRegionInfo.i +++ b/lldb/scripts/interface/SBMemoryRegionInfo.i @@ -55,6 +55,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBMemoryRegionInfo) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBModule.i b/lldb/scripts/interface/SBModule.i index 03c8aeb2bed9e..a9d9480cd7cf1 100644 --- a/lldb/scripts/interface/SBModule.i +++ b/lldb/scripts/interface/SBModule.i @@ -344,6 +344,8 @@ public: lldb::SBAddress GetObjectFileEntryPointAddress() const; + STRING_EXTENSION(SBModule) + #ifdef SWIGPYTHON %pythoncode %{ def __len__(self): diff --git a/lldb/scripts/interface/SBModuleSpec.i b/lldb/scripts/interface/SBModuleSpec.i index ec4e9bb7fbf72..64d0aa641a774 100644 --- a/lldb/scripts/interface/SBModuleSpec.i +++ b/lldb/scripts/interface/SBModuleSpec.i @@ -91,6 +91,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBModuleSpec) }; @@ -127,6 +128,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBModuleSpecList) }; } // namespace lldb diff --git a/lldb/scripts/interface/SBTarget.i b/lldb/scripts/interface/SBTarget.i index 02c70b6e1cd6f..371bf5c35ebd0 100644 --- a/lldb/scripts/interface/SBTarget.i +++ b/lldb/scripts/interface/SBTarget.i @@ -967,21 +967,7 @@ public: lldb::SBValue EvaluateExpression (const char *expr, const lldb::SBExpressionOptions &options); - %extend { - %nothreadallow; - std::string lldb::SBTarget::__str__(){ - lldb::SBStream stream; - $self->GetDescription (stream, lldb::eDescriptionLevelBrief); - - const char *desc = stream.GetData(); - size_t desc_len = stream.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - - return std::string(desc, desc_len); - } - %clearnothreadallow; - } + STRING_EXTENSION_LEVEL(SBTarget, lldb::eDescriptionLevelBrief) #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/lldb.swig b/lldb/scripts/lldb.swig index c3b9083327410..bebf9bf534a34 100644 --- a/lldb/scripts/lldb.swig +++ b/lldb/scripts/lldb.swig @@ -59,6 +59,23 @@ except ImportError: // Parameter types will be used in the autodoc string. 
%feature("autodoc", "1"); +%define ARRAYHELPER(type,name) +%inline %{ +type *new_ ## name (int nitems) { + return (type *) malloc(sizeof(type)*nitems); +} +void delete_ ## name(type *t) { + free(t); +} +type name ## _get(type *t, int index) { + return t[index]; +} +void name ## _set(type *t, int index, type val) { + t[index] = val; +} +%} +%enddef + %pythoncode%{ import uuid import re @@ -95,6 +112,7 @@ def lldb_iter(obj, getsize, getelem): %include %include "./Python/python-typemaps.swig" +%include "./macros.swig" %include "./headers.swig" %{ diff --git a/lldb/scripts/lldb_lua.swig b/lldb/scripts/lldb_lua.swig index bf8809015d9a3..3b279a6b69e7f 100644 --- a/lldb/scripts/lldb_lua.swig +++ b/lldb/scripts/lldb_lua.swig @@ -9,6 +9,7 @@ %module lldb %include +%include "./macros.swig" %include "./headers.swig" %{ diff --git a/lldb/scripts/macros.swig b/lldb/scripts/macros.swig new file mode 100644 index 0000000000000..e0756c2f17932 --- /dev/null +++ b/lldb/scripts/macros.swig @@ -0,0 +1,33 @@ +%define STRING_EXTENSION_LEVEL(Class, Level) +%extend { + %nothreadallow; + std::string lldb:: ## Class ## ::__str__(){ + lldb::SBStream stream; + $self->GetDescription (stream, Level); + const char *desc = stream.GetData(); + size_t desc_len = stream.GetSize(); + if (desc_len > 0 && (desc[desc_len-1] == 'n' || desc[desc_len-1] == 'r')) { + --desc_len; + } + return std::string(desc, desc_len); + } + %clearnothreadallow; +} +%enddef + +%define STRING_EXTENSION(Class) +%extend { + %nothreadallow; + std::string lldb:: ## Class ## ::__str__(){ + lldb::SBStream stream; + $self->GetDescription (stream); + const char *desc = stream.GetData(); + size_t desc_len = stream.GetSize(); + if (desc_len > 0 && (desc[desc_len-1] == 'n' || desc[desc_len-1] == 'r')) { + --desc_len; + } + return std::string(desc, desc_len); + } + %clearnothreadallow; +} +%enddef From ee57469a5128a5cd4f8b13945c873d3421a4b3d0 Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Wed, 11 Dec 2019 21:45:53 +0100 Subject: [PATCH 06/89] [X86] Remove EFLAGS from live-in lists in X86FlagsCopyLowering. When EFLAGS is no longer live into a basic block, remove it from the live-in list. Fixes https://bugs.llvm.org/show_bug.cgi?id=44462. Review: Craig Topper Differential Revision: https://reviews.llvm.org/D71375 --- llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 3 + .../CodeGen/X86/copy-eflags-liveinlists.mir | 92 +++++++++++++++++++ 2 files changed, 95 insertions(+) create mode 100644 llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp index 0915a1532df9e..b1d2de29c8965 100644 --- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp +++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp @@ -702,6 +702,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) { } Blocks.push_back(SuccMBB); + + // After this, EFLAGS will be recreated before each use. + SuccMBB->removeLiveIn(X86::EFLAGS); } } while (!Blocks.empty()); diff --git a/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir b/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir new file mode 100644 index 0000000000000..54454fe0017f1 --- /dev/null +++ b/llvm/test/CodeGen/X86/copy-eflags-liveinlists.mir @@ -0,0 +1,92 @@ +# RUN: llc -mtriple=i686-unknown-unknown -run-pass=x86-flags-copy-lowering \ +# RUN: -print-after=x86-flags-copy-lowering %s -o - | FileCheck %s +# +# Check that $eflags is removed from live-in lists of successor blocks. 
+# +# CHECK-NOT: liveins: $eflags + +--- | + define void @fun(i16 %arg, i64 %arg1, i8 %arg2, i8* %arg3, i32 %arg4) { ret void} +... +--- +name: fun +alignment: 16 +tracksRegLiveness: true +registers: + - { id: 0, class: gr16 } + - { id: 1, class: gr16 } + - { id: 2, class: gr16 } + - { id: 3, class: gr32 } + - { id: 4, class: gr32 } + - { id: 5, class: gr8 } + - { id: 6, class: gr32 } + - { id: 7, class: gr32 } + - { id: 8, class: gr32 } + - { id: 9, class: gr32 } + - { id: 10, class: gr32 } + - { id: 11, class: gr32 } + - { id: 12, class: gr32 } + - { id: 13, class: gr8 } + - { id: 14, class: gr32 } + - { id: 15, class: gr32 } + - { id: 16, class: gr32_abcd } + - { id: 17, class: gr8 } + - { id: 18, class: gr8 } + - { id: 19, class: gr32 } + - { id: 20, class: gr32 } +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, offset: 20, size: 4, alignment: 4, isImmutable: true } + - { id: 1, offset: 16, size: 4, alignment: 4, isImmutable: true } + - { id: 2, offset: 12, size: 1, alignment: 4, isImmutable: true } + - { id: 3, offset: 8, size: 4, alignment: 4, isImmutable: true } + - { id: 4, offset: 4, size: 4, alignment: 4, isImmutable: true } + - { id: 5, size: 2, alignment: 4, isImmutable: true } +machineFunctionInfo: {} +body: | + bb.0: + %4:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.3) + %3:gr32 = MOV32rm %fixed-stack.4, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.4) + %7:gr32 = MOV32rm %fixed-stack.0, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.0) + %6:gr32 = MOV32rm %fixed-stack.1, 1, $noreg, 0, $noreg :: (load 4 from %fixed-stack.1) + %5:gr8 = MOV8rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load 1 from %fixed-stack.2, align 4) + %9:gr32 = IMPLICIT_DEF + %11:gr32 = IMPLICIT_DEF + + bb.1: + successors: %bb.2, %bb.3 + + CMP32rr %3, %9, implicit-def $eflags + %10:gr32 = SBB32rr %4, %11, implicit-def $eflags, implicit $eflags + %12:gr32 = COPY $eflags + %13:gr8 = SETCCr 12, implicit $eflags + %14:gr32 = MOVZX32rr8 killed %13 + %15:gr32 = NEG32r %14, implicit-def dead $eflags + %16:gr32_abcd = MOV32r0 implicit-def dead $eflags + $eflags = COPY %12 + %17:gr8 = COPY %16.sub_8bit + JCC_1 %bb.3, 12, implicit $eflags + + bb.2: + liveins: $eflags + + + bb.3: + successors: %bb.4, %bb.5 + liveins: $eflags + + %18:gr8 = PHI %5, %bb.2, %17, %bb.1 + MOV8mr %6, 1, $noreg, 0, $noreg, killed %18 :: (volatile store 1 into %ir.arg3) + JCC_1 %bb.5, 12, implicit $eflags + + bb.4: + + bb.5: + %19:gr32 = PHI %16, %bb.4, %15, %bb.3 + $eax = COPY %7 + CDQ implicit-def $eax, implicit-def $edx, implicit $eax + IDIV32r killed %19, implicit-def dead $eax, implicit-def $edx, implicit-def dead $eflags, implicit $eax, implicit $edx + JMP_1 %bb.1 + +... 
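The new removeLiveIn call in the X86FlagsCopyLowering change above is the general idiom for keeping liveness data consistent once a pass promises to recreate a physical register before every use: the register also has to disappear from the live-in lists of the rewritten block's successors. A minimal sketch of that idiom, not taken from the patch (the helper name is made up; the MachineBasicBlock members are the existing API):

  #include "llvm/CodeGen/MachineBasicBlock.h"

  using namespace llvm;

  // Drop Reg from every successor's live-in list. removeLiveIn is a no-op for
  // blocks that never listed Reg, so the loop is safe to run unconditionally.
  static void clearLiveInFromSuccessors(MachineBasicBlock &MBB, MCPhysReg Reg) {
    for (MachineBasicBlock *Succ : MBB.successors())
      Succ->removeLiveIn(Reg);
  }

In the patch the register is fixed to X86::EFLAGS and the call is issued as each successor block is queued for rewriting.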
From b2fb6a7ba118f651cc76580ecb48eb5f877920aa Mon Sep 17 00:00:00 2001 From: Gabor Horvath Date: Wed, 8 Jan 2020 16:37:46 -0800 Subject: [PATCH 07/89] [NFC] Whitespace fixes --- clang/include/clang/Basic/Attr.td | 4 ++-- clang/include/clang/Basic/AttrDocs.td | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index d9ca121b6510f..c992d6459f0c0 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -178,8 +178,8 @@ class FunctionArgument : Argument; class NamedArgument : Argument; + opt, + fake>; class TypeArgument : Argument; class UnsignedArgument : Argument; class VariadicUnsignedArgument : Argument; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 515476df3fddc..70bf2517cdcba 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -4378,7 +4378,7 @@ below. The explicit attribute annotation indicates that the third parameter (`start_routine`) is called zero or more times by the `pthread_create` function, and that the fourth parameter (`arg`) is passed along. Note that the callback behavior of `pthread_create` is automatically recognized by Clang. In addition, -the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for +the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for `#pragma omp target teams` and `#pragma omp parallel`, respectively, are also automatically recognized as broker functions. Further functions might be added in the future. From 85ee4ff4e474788abc7abdc939114281c2a68ec1 Mon Sep 17 00:00:00 2001 From: Billy Robert O'Neal III Date: Wed, 8 Jan 2020 16:46:55 -0800 Subject: [PATCH 08/89] [libcxx] [test] Disable refwrap/weak_result.pass.cpp in C++20 mode (broken by P0357R3) --- .../std/utilities/function.objects/refwrap/weak_result.pass.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp b/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp index a3577ff064275..848858d33b19f 100644 --- a/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp +++ b/libcxx/test/std/utilities/function.objects/refwrap/weak_result.pass.cpp @@ -12,6 +12,8 @@ // has weak result type +// REQUIRES: c++98 || c++03 || c++11 || c++14 || c++17 + #include #include From 659efa21f1b3eded304b7239b356eecf1f5034ae Mon Sep 17 00:00:00 2001 From: Jonas Paulsson Date: Tue, 19 Nov 2019 12:34:49 +0100 Subject: [PATCH 09/89] Recommit "[MachineVerifier] Improve verification of live-in lists." MachineVerifier::visitMachineFunctionAfter() is extended to check the live-through case for live-in lists. This is only done for registers without aliases and that are neither allocatable or reserved, such as the SystemZ::CC register. The MachineVerifier earlier only catched the case of a live-in use without an entry in the live-in list (as "using an undefined physical register"). A comment in LivePhysRegs.h has been added stating a guarantee that addLiveOuts() can be trusted for a full register both before and after register allocation. 
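As a rough illustration of the pattern that guarantee is meant to support (the helper below is hypothetical; LivePhysRegs, addLiveOuts and contains are the existing API):

  #include "llvm/CodeGen/LivePhysRegs.h"
  #include "llvm/CodeGen/MachineBasicBlock.h"
  #include "llvm/CodeGen/TargetRegisterInfo.h"

  using namespace llvm;

  // Whether a full physical register such as SystemZ::CC is live out of MBB.
  // Per the new comment, the answer can be trusted both before and after
  // register allocation, as long as the full register (not just one of its
  // sub-registers) is queried.
  static bool isPhysRegLiveOut(const MachineBasicBlock &MBB, MCPhysReg PhysReg,
                               const TargetRegisterInfo &TRI) {
    LivePhysRegs LiveRegs(TRI);
    LiveRegs.addLiveOuts(MBB);
    return LiveRegs.contains(PhysReg);
  }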
Review: Quentin Colombet Differential Revision: https://reviews.llvm.org/D68267 --- llvm/include/llvm/CodeGen/LivePhysRegs.h | 3 ++ llvm/lib/CodeGen/MachineVerifier.cpp | 26 +++++++++++ llvm/test/MachineVerifier/live-ins-01.mir | 57 +++++++++++++++++++++++ llvm/test/MachineVerifier/live-ins-02.mir | 32 +++++++++++++ llvm/test/MachineVerifier/live-ins-03.mir | 36 ++++++++++++++ 5 files changed, 154 insertions(+) create mode 100644 llvm/test/MachineVerifier/live-ins-01.mir create mode 100644 llvm/test/MachineVerifier/live-ins-02.mir create mode 100644 llvm/test/MachineVerifier/live-ins-03.mir diff --git a/llvm/include/llvm/CodeGen/LivePhysRegs.h b/llvm/include/llvm/CodeGen/LivePhysRegs.h index 50da0b3d5c483..085893462a083 100644 --- a/llvm/include/llvm/CodeGen/LivePhysRegs.h +++ b/llvm/include/llvm/CodeGen/LivePhysRegs.h @@ -137,6 +137,9 @@ class LivePhysRegs { /// Live out registers are the union of the live-in registers of the successor /// blocks and pristine registers. Live out registers of the end block are the /// callee saved registers. + /// If a register is not added by this method, it is guaranteed to not be + /// live out from MBB, although a sub-register may be. This is true + /// both before and after regalloc. void addLiveOuts(const MachineBasicBlock &MBB); /// Adds all live-out registers of basic block \p MBB but skips pristine diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index ca57e51268e88..b2534c2e53d4a 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -2312,6 +2312,32 @@ void MachineVerifier::visitMachineFunctionAfter() { if (LiveInts) verifyLiveIntervals(); + // Check live-in list of each MBB. If a register is live into MBB, check + // that the register is in regsLiveOut of each predecessor block. Since + // this must come from a definition in the predecesssor or its live-in + // list, this will catch a live-through case where the predecessor does not + // have the register in its live-in list. This currently only checks + // registers that have no aliases, are not allocatable and are not + // reserved, which could mean a condition code register for instance. + if (MRI->tracksLiveness()) + for (const auto &MBB : *MF) + for (MachineBasicBlock::RegisterMaskPair P : MBB.liveins()) { + MCPhysReg LiveInReg = P.PhysReg; + bool hasAliases = MCRegAliasIterator(LiveInReg, TRI, false).isValid(); + if (hasAliases || isAllocatable(LiveInReg) || isReserved(LiveInReg)) + continue; + for (const MachineBasicBlock *Pred : MBB.predecessors()) { + BBInfo &PInfo = MBBInfoMap[Pred]; + if (!PInfo.regsLiveOut.count(LiveInReg)) { + report("Live in register not found to be live out from predecessor.", + &MBB); + errs() << TRI->getName(LiveInReg) + << " not found to be live out from " + << printMBBReference(*Pred) << "\n"; + } + } + } + for (auto CSInfo : MF->getCallSitesInfo()) if (!CSInfo.first->isCall()) report("Call site info referencing instruction that is not call", MF); diff --git a/llvm/test/MachineVerifier/live-ins-01.mir b/llvm/test/MachineVerifier/live-ins-01.mir new file mode 100644 index 0000000000000..51c05dacf0558 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-01.mir @@ -0,0 +1,57 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that a the machine verifier reports an error when a register in +# liveins is not liveout from predecessor. 
+ +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor. *** +# CHECK:- function: f2 +# CHECK:- basic block: %bb.2 +# CHECK:CC not found to be live out from %bb.1 +--- +name: f2 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + KILL killed $cc + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l + +... diff --git a/llvm/test/MachineVerifier/live-ins-02.mir b/llvm/test/MachineVerifier/live-ins-02.mir new file mode 100644 index 0000000000000..d76325cdd1082 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-02.mir @@ -0,0 +1,32 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that a the machine verifier reports an error when a register in +# liveins is not liveout from predecessor. + +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + + bb.2: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor. *** +# CHECK:- function: f1 +# CHECK:- basic block: %bb.2 +# CHECK:CC not found to be live out from %bb.1 diff --git a/llvm/test/MachineVerifier/live-ins-03.mir b/llvm/test/MachineVerifier/live-ins-03.mir new file mode 100644 index 0000000000000..b5345ccdc3b63 --- /dev/null +++ b/llvm/test/MachineVerifier/live-ins-03.mir @@ -0,0 +1,36 @@ +# RUN: not llc -o - %s -mtriple=s390x-linux-gnu -mcpu=z14 -run-pass none 2>&1 | FileCheck %s +# REQUIRES: systemz-registered-target + +# Test that a the machine verifier reports an error when a register in +# liveins is not liveout from predecessor. + +--- +name: f1 +tracksRegLiveness: true +machineFunctionInfo: {} +body: | + bb.0: + liveins: $r2l, $r3l + + %1:gr32bit = COPY $r3l + %0:gr32bit = COPY $r2l + CHIMux %0, 0, implicit-def $cc + + bb.1: + liveins: $cc + BRC 14, 8, %bb.3, implicit $cc + + bb.2: + + bb.3: + liveins: $cc + + %2:grx32bit = LOCRMux %1, %0, 14, 8, implicit $cc + $r2l = COPY %2 + Return implicit $r2l +... + +# CHECK: *** Bad machine code: Live in register not found to be live out from predecessor. *** +# CHECK:- function: f1 +# CHECK:- basic block: %bb.3 +# CHECK:CC not found to be live out from %bb.2 From ff0311c4b3b9dce9d25d08e38aa163682b155513 Mon Sep 17 00:00:00 2001 From: Justin Hibbits Date: Mon, 30 Dec 2019 15:09:39 -0600 Subject: [PATCH 10/89] [PowerPC]: Add powerpcspe target triple subarch component Summary: This allows the use of '-target powerpcspe-unknown-linux-gnu' or 'powerpcspe-unknown-freebsd' to be used, instead of '-target powerpc-unknown-linux-gnu -mspe'. 
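As a sketch of what the new subarch component makes possible (mirroring the Triple unit test added below; the helper function itself is only illustrative):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/Triple.h"

  using namespace llvm;

  // "powerpcspe" still parses as 32-bit ppc, but now carries an SPE subarch
  // marker that the PowerPC subtarget translates into the +spe feature.
  static bool tripleRequestsSPE(StringRef TT) {
    Triple T(TT);
    return T.getArch() == Triple::ppc &&
           T.getSubArch() == Triple::PPCSubArch_spe;
  }

  // tripleRequestsSPE("powerpcspe-unknown-freebsd")  -> true
  // tripleRequestsSPE("powerpc-unknown-linux-gnu")   -> false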
Reviewed By: dim Differential Revision: https://reviews.llvm.org/D72014 --- clang/lib/Basic/Targets/PPC.cpp | 3 ++- clang/lib/Basic/Targets/PPC.h | 3 +-- clang/test/Preprocessor/init.c | 3 ++- llvm/include/llvm/ADT/Triple.h | 4 +++- llvm/lib/Support/Triple.cpp | 5 ++++- llvm/lib/Target/PowerPC/PPCSubtarget.cpp | 3 +++ llvm/unittests/ADT/TripleTest.cpp | 7 +++++++ 7 files changed, 22 insertions(+), 6 deletions(-) diff --git a/clang/lib/Basic/Targets/PPC.cpp b/clang/lib/Basic/Targets/PPC.cpp index 1877d4a5ef70b..bc0ffb7fa440a 100644 --- a/clang/lib/Basic/Targets/PPC.cpp +++ b/clang/lib/Basic/Targets/PPC.cpp @@ -316,7 +316,8 @@ bool PPCTargetInfo::initFeatureMap( .Case("pwr8", true) .Default(false); - Features["spe"] = llvm::StringSwitch(CPU) + Features["spe"] = getTriple().getSubArch() == llvm::Triple::PPCSubArch_spe || + llvm::StringSwitch(CPU) .Case("8548", true) .Case("e500", true) .Default(false); diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 3076025fc5b2b..270aa7ff91815 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -87,8 +87,7 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { // Note: GCC recognizes the following additional cpus: // 401, 403, 405, 405fp, 440fp, 464, 464fp, 476, 476fp, 505, 740, 801, - // 821, 823, 8540, 8548, e300c2, e300c3, e500mc64, e6500, 860, cell, - // titan, rs64. + // 821, 823, 8540, e300c2, e300c3, e500mc64, e6500, 860, cell, titan, rs64. bool isValidCPUName(StringRef Name) const override; void fillValidCPUList(SmallVectorImpl &Values) const override; diff --git a/clang/test/Preprocessor/init.c b/clang/test/Preprocessor/init.c index 4d8c6e5c46b63..a03725889360e 100644 --- a/clang/test/Preprocessor/init.c +++ b/clang/test/Preprocessor/init.c @@ -6551,10 +6551,11 @@ // PPC32-LINUX-NOT: _CALL_LINUX // // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-feature +spe < /dev/null | FileCheck -match-full-lines -check-prefix PPC32-SPE %s +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpcspe-unknown-linux-gnu < /dev/null | FileCheck -match-full-lines -check-prefix PPC32-SPE %s // // PPC32-SPE:#define __NO_FPRS__ 1 // PPC32-SPE:#define __SPE__ 1 -// +// // RUN: %clang_cc1 -E -dM -ffreestanding -triple=powerpc-unknown-linux-gnu -target-cpu 8548 < /dev/null | FileCheck -match-full-lines -check-prefix PPC8548 %s // // PPC8548:#define __NO_FPRS__ 1 diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index a2cae37542803..88a86bfa5ff3a 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -128,7 +128,9 @@ class Triple { KalimbaSubArch_v4, KalimbaSubArch_v5, - MipsSubArch_r6 + MipsSubArch_r6, + + PPCSubArch_spe }; enum VendorType { UnknownVendor, diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index 5bf93638a219e..f2debc443d240 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -387,7 +387,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { // FIXME: Do we need to support these? 
.Cases("i786", "i886", "i986", Triple::x86) .Cases("amd64", "x86_64", "x86_64h", Triple::x86_64) - .Cases("powerpc", "ppc", "ppc32", Triple::ppc) + .Cases("powerpc", "powerpcspe", "ppc", "ppc32", Triple::ppc) .Cases("powerpc64", "ppu", "ppc64", Triple::ppc64) .Cases("powerpc64le", "ppc64le", Triple::ppc64le) .Case("xscale", Triple::arm) @@ -559,6 +559,9 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { (SubArchName.endswith("r6el") || SubArchName.endswith("r6"))) return Triple::MipsSubArch_r6; + if (SubArchName == "powerpcspe") + return Triple::PPCSubArch_spe; + StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); // For now, this is the small part. Early return. diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp index dc19cb0ac3093..77122e62dd5fb 100644 --- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp +++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp @@ -151,6 +151,9 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { TargetTriple.isMusl()) SecurePlt = true; + if (TargetTriple.getSubArch() == Triple::PPCSubArch_spe) + HasSPE = true; + if (HasSPE && IsPPC64) report_fatal_error( "SPE is only supported for 32-bit targets.\n", false); if (HasSPE && (HasAltivec || HasQPX || HasVSX || HasFPU)) diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index d8123bbbfdf7a..c7f40dd1ca143 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -163,6 +163,13 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::UnknownOS, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("powerpcspe-unknown-freebsd"); + EXPECT_EQ(Triple::ppc, T.getArch()); + EXPECT_EQ(Triple::PPCSubArch_spe, T.getSubArch()); + EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); + EXPECT_EQ(Triple::FreeBSD, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("arm-none-none-eabi"); EXPECT_EQ(Triple::arm, T.getArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); From 76aab66d34446ccf764cf8127b73e1517df75fb4 Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 8 Jan 2020 17:11:23 -0800 Subject: [PATCH 11/89] [NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal. Summary: The goal is to simplify experimentation on the cost model. Today, CallAnalyzer decides 2 things: legality, and benefit. The refactoring keeps legality assessment in CallAnalyzer, and factors benefit evaluation out, as an extension. Reviewers: davidxl, eraman Subscribers: kamleshbhalui, fedor.sergeev, hiraditya, baloghadamsoftware, haicheng, a.sidorin, Szelethus, donat.nagy, dkrupp, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D71733 --- llvm/lib/Analysis/InlineCost.cpp | 752 +++++++++++++++++-------------- 1 file changed, 422 insertions(+), 330 deletions(-) diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index b5f4192bf8562..017301fce9b70 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -93,11 +93,12 @@ static cl::opt OptComputeFullInlineCost( "exceeds the threshold.")); namespace { - +class InlineCostCallAnalyzer; class CallAnalyzer : public InstVisitor { typedef InstVisitor Base; friend class InstVisitor; +protected: /// The TargetTransformInfo available for this compilation. const TargetTransformInfo &TTI; @@ -124,20 +125,86 @@ class CallAnalyzer : public InstVisitor { /// easily cacheable. Instead, use the cover function paramHasAttr. 
CallBase &CandidateCall; - /// Tunable parameters that control the analysis. - const InlineParams &Params; + /// Extension points for handling callsite features. + /// Called after a basic block was analyzed. + virtual void onBlockAnalyzed(const BasicBlock *BB) {} - /// Upper bound for the inlining cost. Bonuses are being applied to account - /// for speculative "expected profit" of the inlining decision. - int Threshold; + /// Called at the end of the analysis of the callsite. Return the outcome of + /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or + /// the reason it can't. + virtual InlineResult finalizeAnalysis() { return true; } - /// Inlining cost measured in abstract units, accounts for all the - /// instructions expected to be executed for a given function invocation. - /// Instructions that are statically proven to be dead based on call-site - /// arguments are not counted here. - int Cost = 0; + /// Called when we're about to start processing a basic block, and every time + /// we are done processing an instruction. Return true if there is no point in + /// continuing the analysis (e.g. we've determined already the call site is + /// too expensive to inline) + virtual bool shouldStop() { return false; } + + /// Called before the analysis of the callee body starts (with callsite + /// contexts propagated). It checks callsite-specific information. Return a + /// reason analysis can't continue if that's the case, or 'true' if it may + /// continue. + virtual InlineResult onAnalysisStart() { return true; } + + /// Called if the analysis engine decides SROA cannot be done for the given + /// alloca. + virtual void onDisableSROA(AllocaInst *Arg) {} + + /// Called the analysis engine determines load elimination won't happen. + virtual void onDisableLoadElimination() {} + + /// Called to account for a call. + virtual void onCallPenalty() {} + + /// Called to account for the expectation the inlining would result in a load + /// elimination. + virtual void onLoadEliminationOpportunity() {} + + /// Called to account for the cost of argument setup for the Call in the + /// callee's body (not the callsite currently under analysis). + virtual void onCallArgumentSetup(const CallBase &Call) {} + + /// Called to account for a load relative intrinsic. + virtual void onLoadRelativeIntrinsic() {} + + /// Called to account for a lowered call. + virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { + } - bool ComputeFullInlineCost; + /// Account for a jump table of given size. Return false to stop further + /// processing the switch instruction + virtual bool onJumpTable(unsigned JumpTableSize) { return true; } + + /// Account for a case cluster of given size. Return false to stop further + /// processing of the instruction. + virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } + + /// Called at the end of processing a switch instruction, with the given + /// number of case clusters. + virtual void onFinalizeSwitch(unsigned JumpTableSize, + unsigned NumCaseCluster) {} + + /// Called to account for any other instruction not specifically accounted + /// for. + virtual void onCommonInstructionSimplification() {} + + /// Start accounting potential benefits due to SROA for the given alloca. + virtual void onInitializeSROAArg(AllocaInst *Arg) {} + + /// Account SROA savings for the AllocaInst value. 
+ virtual void onAggregateSROAUse(AllocaInst *V) {} + + bool handleSROA(Value *V, bool DoNotDisable) { + // Check for SROA candidates in comparisons. + if (auto *SROAArg = getSROAArgForValueOrNull(V)) { + if (DoNotDisable) { + onAggregateSROAUse(SROAArg); + return true; + } + disableSROA(SROAArg); + } + return false; + } bool IsCallerRecursive = false; bool IsRecursiveCall = false; @@ -149,20 +216,11 @@ class CallAnalyzer : public InstVisitor { bool HasUninlineableIntrinsic = false; bool InitsVargArgs = false; - /// Attempt to evaluate indirect calls to boost its inline cost. - bool BoostIndirectCalls; - /// Number of bytes allocated statically by the callee. uint64_t AllocatedSize = 0; unsigned NumInstructions = 0; unsigned NumVectorInstructions = 0; - /// Bonus to be applied when percentage of vector instructions in callee is - /// high (see more details in updateThreshold). - int VectorBonus = 0; - /// Bonus to be applied when the callee has only one reachable basic block. - int SingleBBBonus = 0; - /// While we walk the potentially-inlined instructions, we build up and /// maintain a mapping of simplified values specific to this callsite. The /// idea is to propagate any special information we have about arguments to @@ -174,12 +232,10 @@ class CallAnalyzer : public InstVisitor { /// Keep track of the values which map back (through function arguments) to /// allocas on the caller stack which could be simplified through SROA. - DenseMap SROAArgValues; - - /// The mapping of caller Alloca values to their accumulated cost savings. If - /// we have to disable SROA for one of the allocas, this tells us how much - /// cost must be added. - DenseMap SROAArgCosts; + /// We can disable an association (because for some reason the SROA oportunity + /// is lost) by setting the value to nullptr. We don't delete because we still + /// want isAllocaDerivedArg to function correctly. + DenseMap SROAArgValues; /// Keep track of values which map to a pointer base and constant offset. DenseMap> ConstantOffsetPtrs; @@ -196,17 +252,18 @@ class CallAnalyzer : public InstVisitor { /// loads. bool EnableLoadElimination; SmallPtrSet LoadAddrSet; - int LoadEliminationCost = 0; + + AllocaInst *getSROAArgForValueOrNull(Value *V) const { + auto It = SROAArgValues.find(V); + if (It == SROAArgValues.end()) + return nullptr; + return It->second; + } // Custom simplification helper routines. bool isAllocaDerivedArg(Value *V); - bool lookupSROAArgAndCost(Value *V, Value *&Arg, - DenseMap::iterator &CostIt); - void disableSROA(DenseMap::iterator CostIt); void disableSROA(Value *V); void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); - void accumulateSROACost(DenseMap::iterator CostIt, - int InstructionCost); void disableLoadElimination(); bool isGEPFree(GetElementPtrInst &GEP); bool canFoldInboundsGEP(GetElementPtrInst &I); @@ -227,32 +284,13 @@ class CallAnalyzer : public InstVisitor { /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); - /// Update Threshold based on callsite properties such as callee - /// attributes and callee hotness for PGO builds. The Callee is explicitly - /// passed to support analyzing indirect calls whose target is inferred by - /// analysis. - void updateThreshold(CallBase &Call, Function &Callee); - /// Return true if size growth is allowed when inlining the callee at \p Call. bool allowSizeGrowth(CallBase &Call); - /// Return true if \p Call is a cold callsite. 
- bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); - - /// Return a higher threshold if \p Call is a hot callsite. - Optional getHotCallSiteThreshold(CallBase &Call, - BlockFrequencyInfo *CallerBFI); - // Custom analysis routines. InlineResult analyzeBlock(BasicBlock *BB, SmallPtrSetImpl &EphValues); - /// Handle a capped 'int' increment for Cost. - void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) { - assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound"); - Cost = (int)std::min(UpperBound, Cost + Inc); - } - // Disable several entry points to the visitor so we don't accidentally use // them by declaring but not defining them here. void visit(Module *); @@ -298,20 +336,13 @@ class CallAnalyzer : public InstVisitor { std::function &GetAssumptionCache, Optional> &GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, - Function &Callee, CallBase &Call, const InlineParams &Params, - bool BoostIndirect = true) + Function &Callee, CallBase &Call) : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), - CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold), - ComputeFullInlineCost(OptComputeFullInlineCost || - Params.ComputeFullInlineCost || ORE), - BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {} + CandidateCall(Call), EnableLoadElimination(true) {} InlineResult analyze(); - int getThreshold() { return Threshold; } - int getCost() { return Cost; } - // Keep a bunch of stats about the cost savings found so we can print them // out when debugging. unsigned NumConstantArgs = 0; @@ -320,12 +351,283 @@ class CallAnalyzer : public InstVisitor { unsigned NumConstantPtrCmps = 0; unsigned NumConstantPtrDiffs = 0; unsigned NumInstructionsSimplified = 0; + + void dump(); +}; + +/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note +/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer +class InlineCostCallAnalyzer final : public CallAnalyzer { + const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; + const bool ComputeFullInlineCost; + int LoadEliminationCost = 0; + /// Bonus to be applied when percentage of vector instructions in callee is + /// high (see more details in updateThreshold). + int VectorBonus = 0; + /// Bonus to be applied when the callee has only one reachable basic block. + int SingleBBBonus = 0; + + /// Tunable parameters that control the analysis. + const InlineParams &Params; + + /// Upper bound for the inlining cost. Bonuses are being applied to account + /// for speculative "expected profit" of the inlining decision. + int Threshold = 0; + + /// Attempt to evaluate indirect calls to boost its inline cost. + const bool BoostIndirectCalls; + + /// Inlining cost measured in abstract units, accounts for all the + /// instructions expected to be executed for a given function invocation. + /// Instructions that are statically proven to be dead based on call-site + /// arguments are not counted here. + int Cost = 0; + + bool SingleBB = true; + unsigned SROACostSavings = 0; unsigned SROACostSavingsLost = 0; + /// The mapping of caller Alloca values to their accumulated cost savings. If + /// we have to disable SROA for one of the allocas, this tells us how much + /// cost must be added. + DenseMap SROAArgCosts; + + /// Return true if \p Call is a cold callsite. 
+ bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); + + /// Update Threshold based on callsite properties such as callee + /// attributes and callee hotness for PGO builds. The Callee is explicitly + /// passed to support analyzing indirect calls whose target is inferred by + /// analysis. + void updateThreshold(CallBase &Call, Function &Callee); + /// Return a higher threshold if \p Call is a hot callsite. + Optional getHotCallSiteThreshold(CallBase &Call, + BlockFrequencyInfo *CallerBFI); + + /// Handle a capped 'int' increment for Cost. + void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) { + assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound"); + Cost = (int)std::min(UpperBound, Cost + Inc); + } + + void onDisableSROA(AllocaInst *Arg) override { + auto CostIt = SROAArgCosts.find(Arg); + if (CostIt == SROAArgCosts.end()) + return; + addCost(CostIt->second); + SROACostSavings -= CostIt->second; + SROACostSavingsLost += CostIt->second; + SROAArgCosts.erase(CostIt); + } + + void onDisableLoadElimination() override { + addCost(LoadEliminationCost); + LoadEliminationCost = 0; + } + void onCallPenalty() override { addCost(InlineConstants::CallPenalty); } + void onCallArgumentSetup(const CallBase &Call) override { + // Pay the price of the argument setup. We account for the average 1 + // instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); + } + void onLoadRelativeIntrinsic() override { + // This is normally lowered to 4 LLVM instructions. + addCost(3 * InlineConstants::InstrCost); + } + void onLoweredCall(Function *F, CallBase &Call, + bool IsIndirectCall) override { + // We account for the average 1 instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); + + // If we have a constant that we are calling as a function, we can peer + // through it and see the function target. This happens not infrequently + // during devirtualization and so we want to give it a hefty bonus for + // inlining, but cap that bonus in the event that inlining wouldn't pan out. + // Pretend to inline the function, with a custom threshold. + if (IsIndirectCall && BoostIndirectCalls) { + auto IndirectCallParams = Params; + IndirectCallParams.DefaultThreshold = + InlineConstants::IndirectCallThreshold; + /// FIXME: if InlineCostCallAnalyzer is derived from, this may need + /// to instantiate the derived class. + InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, + Call, IndirectCallParams, false); + if (CA.analyze()) { + // We were able to inline the indirect call! Subtract the cost from the + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + } + } else + // Otherwise simply add the cost for merely making the call. + addCost(InlineConstants::CallPenalty); + } + + void onFinalizeSwitch(unsigned JumpTableSize, + unsigned NumCaseCluster) override { + // If suitable for a jump table, consider the cost for the table size and + // branch to destination. + // Maximum valid cost increased in this function. + if (JumpTableSize) { + int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + + 4 * InlineConstants::InstrCost; + + addCost(JTCost, (int64_t)CostUpperBound); + return; + } + // Considering forming a binary search, we should find the number of nodes + // which is same as the number of comparisons when lowered. 
For a given + // number of clusters, n, we can define a recursive function, f(n), to find + // the number of nodes in the tree. The recursion is : + // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, + // and f(n) = n, when n <= 3. + // This will lead a binary tree where the leaf should be either f(2) or f(3) + // when n > 3. So, the number of comparisons from leaves should be n, while + // the number of non-leaf should be : + // 2^(log2(n) - 1) - 1 + // = 2^log2(n) * 2^-1 - 1 + // = n / 2 - 1. + // Considering comparisons from leaf and non-leaf nodes, we can estimate the + // number of comparisons in a simple closed form : + // n + n / 2 - 1 = n * 3 / 2 - 1 + if (NumCaseCluster <= 3) { + // Suppose a comparison includes one compare and one conditional branch. + addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); + return; + } + + int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; + int64_t SwitchCost = + ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; + + addCost(SwitchCost, (int64_t)CostUpperBound); + } + void onCommonInstructionSimplification() override { + addCost(InlineConstants::InstrCost); + } + + void onInitializeSROAArg(AllocaInst *Arg) override { SROAArgCosts[Arg] = 0; } + + void onAggregateSROAUse(AllocaInst *SROAArg) override { + auto CostIt = SROAArgCosts.find(SROAArg); + CostIt->second += InlineConstants::InstrCost; + SROACostSavings += InlineConstants::InstrCost; + } + + virtual void onBlockAnalyzed(const BasicBlock *BB) override { + auto *TI = BB->getTerminator(); + // If we had any successors at this point, than post-inlining is likely to + // have them as well. Note that we assume any basic blocks which existed + // due to branches or switches which folded above will also fold after + // inlining. + if (SingleBB && TI->getNumSuccessors() > 1) { + // Take off the bonus we applied to the threshold. + Threshold -= SingleBBBonus; + SingleBB = false; + } + } + virtual InlineResult finalizeAnalysis() override { + // Loops generally act a lot like calls in that they act like barriers to + // movement, require a certain amount of setup, etc. So when optimising for + // size, we penalise any call sites that perform loops. We do this after all + // other costs here, so will likely only be dealing with relatively small + // functions (and hence DT and LI will hopefully be cheap). + auto *Caller = CandidateCall.getFunction(); + if (Caller->hasMinSize()) { + DominatorTree DT(F); + LoopInfo LI(DT); + int NumLoops = 0; + for (Loop *L : LI) { + // Ignore loops that will not be executed + if (DeadBlocks.count(L->getHeader())) + continue; + NumLoops++; + } + addCost(NumLoops * InlineConstants::CallPenalty); + } + + // We applied the maximum possible vector bonus at the beginning. Now, + // subtract the excess bonus, if any, from the Threshold before + // comparing against Cost. + if (NumVectorInstructions <= NumInstructions / 10) + Threshold -= VectorBonus; + else if (NumVectorInstructions <= NumInstructions / 2) + Threshold -= VectorBonus / 2; + + return Cost < std::max(1, Threshold); + } + virtual bool shouldStop() override { + // Bail out the moment we cross the threshold. This means we'll under-count + // the cost, but only when undercounting doesn't matter. 
+ return Cost >= Threshold && !ComputeFullInlineCost; + } + + virtual void onLoadEliminationOpportunity() { + LoadEliminationCost += InlineConstants::InstrCost; + } + + InlineResult onAnalysisStart() override { + // Perform some tweaks to the cost and threshold based on the direct + // callsite information. + + // We want to more aggressively inline vector-dense kernels, so up the + // threshold, and we'll lower it if the % of vector instructions gets too + // low. Note that these bonuses are some what arbitrary and evolved over + // time by accident as much as because they are principled bonuses. + // + // FIXME: It would be nice to remove all such bonuses. At least it would be + // nice to base the bonus values on something more scientific. + assert(NumInstructions == 0); + assert(NumVectorInstructions == 0); + + // Update the threshold based on callsite properties + updateThreshold(CandidateCall, F); + + // While Threshold depends on commandline options that can take negative + // values, we want to enforce the invariant that the computed threshold and + // bonuses are non-negative. + assert(Threshold >= 0); + assert(SingleBBBonus >= 0); + assert(VectorBonus >= 0); + + // Speculatively apply all possible bonuses to Threshold. If cost exceeds + // this Threshold any time, and cost cannot decrease, we can stop processing + // the rest of the function body. + Threshold += (SingleBBBonus + VectorBonus); + + // Give out bonuses for the callsite, as the instructions setting them up + // will be gone after inlining. + addCost(-getCallsiteCost(this->CandidateCall, DL)); + + // If this function uses the coldcc calling convention, prefer not to inline + // it. + if (F.getCallingConv() == CallingConv::Cold) + Cost += InlineConstants::ColdccPenalty; + + // Check if we're done. This can happen due to bonuses and penalties. + if (Cost >= Threshold && !ComputeFullInlineCost) + return "high cost"; + + return true; + } + +public: + InlineCostCallAnalyzer( + const TargetTransformInfo &TTI, + std::function &GetAssumptionCache, + Optional> &GetBFI, + ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, + CallBase &Call, const InlineParams &Params, bool BoostIndirect = true) + : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call), + ComputeFullInlineCost(OptComputeFullInlineCost || + Params.ComputeFullInlineCost || ORE), + Params(Params), Threshold(Params.DefaultThreshold), + BoostIndirectCalls(BoostIndirect) {} void dump(); -}; + int getThreshold() { return Threshold; } + int getCost() { return Cost; } +}; } // namespace /// Test whether the given value is an Alloca-derived function argument. @@ -333,55 +635,22 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) { return SROAArgValues.count(V); } -/// Lookup the SROA-candidate argument and cost iterator which V maps to. -/// Returns false if V does not map to a SROA-candidate. -bool CallAnalyzer::lookupSROAArgAndCost( - Value *V, Value *&Arg, DenseMap::iterator &CostIt) { - if (SROAArgValues.empty() || SROAArgCosts.empty()) - return false; - - DenseMap::iterator ArgIt = SROAArgValues.find(V); - if (ArgIt == SROAArgValues.end()) - return false; - - Arg = ArgIt->second; - CostIt = SROAArgCosts.find(Arg); - return CostIt != SROAArgCosts.end(); -} - -/// Disable SROA for the candidate marked by this cost iterator. -/// -/// This marks the candidate as no longer viable for SROA, and adds the cost -/// savings associated with it back into the inline cost measurement. 
-void CallAnalyzer::disableSROA(DenseMap::iterator CostIt) { - // If we're no longer able to perform SROA we need to undo its cost savings - // and prevent subsequent analysis. - addCost(CostIt->second); - SROACostSavings -= CostIt->second; - SROACostSavingsLost += CostIt->second; - SROAArgCosts.erase(CostIt); - disableLoadElimination(); -} - /// If 'V' maps to a SROA candidate, disable SROA for it. void CallAnalyzer::disableSROA(Value *V) { - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(V, SROAArg, CostIt)) - disableSROA(CostIt); -} - -/// Accumulate the given cost for a particular SROA candidate. -void CallAnalyzer::accumulateSROACost(DenseMap::iterator CostIt, - int InstructionCost) { - CostIt->second += InstructionCost; - SROACostSavings += InstructionCost; + auto It = SROAArgValues.find(V); + if (It == SROAArgValues.end()) + return; + auto *SROAArg = It->second; + if (!SROAArg) + return; + It->second = nullptr; + onDisableSROA(SROAArg); + disableLoadElimination(); } void CallAnalyzer::disableLoadElimination() { if (EnableLoadElimination) { - addCost(LoadEliminationCost); - LoadEliminationCost = 0; + onDisableLoadElimination(); EnableLoadElimination = false; } } @@ -553,9 +822,7 @@ bool CallAnalyzer::visitPHI(PHINode &I) { if (FirstBaseAndOffset.first) { ConstantOffsetPtrs[&I] = FirstBaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(FirstV)) SROAArgValues[&I] = SROAArg; } @@ -585,10 +852,8 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { } bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { - Value *SROAArg; DenseMap::iterator CostIt; - bool SROACandidate = - lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); + auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand()); // Lambda to check whether a GEP's indices are all constant. auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { @@ -599,7 +864,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { }; if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { - if (SROACandidate) + if (SROAArg) SROAArgValues[&I] = SROAArg; // Constant GEPs are modeled as free. @@ -607,8 +872,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { } // Variable GEPs will require math and will disable SROA. - if (SROACandidate) - disableSROA(CostIt); + if (SROAArg) + disableSROA(SROAArg); return isGEPFree(I); } @@ -648,9 +913,7 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) { ConstantOffsetPtrs[&I] = BaseAndOffset; // Also look for SROA candidates here. - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) SROAArgValues[&I] = SROAArg; // Bitcasts are always zero cost. @@ -682,9 +945,7 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { // and so we can just add the integer in here. The only places where SROA is // preserved either cannot fire on an integer, or won't in-and-of themselves // disable SROA (ext) w/o some later use that we would see and disable. 
- Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) SROAArgValues[&I] = SROAArg; return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); @@ -708,9 +969,7 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { } // "Propagate" SROA here in the same manner as we do for ptrtoint above. - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(Op)) SROAArgValues[&I] = SROAArg; return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); @@ -737,7 +996,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { case Instruction::FPToUI: case Instruction::FPToSI: if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - addCost(InlineConstants::CallPenalty); + onCallPenalty(); break; default: break; @@ -810,8 +1069,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { return true; } -bool CallAnalyzer::isColdCallSite(CallBase &Call, - BlockFrequencyInfo *CallerBFI) { +bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, + BlockFrequencyInfo *CallerBFI) { // If global profile summary is available, then callsite's coldness is // determined based on that. if (PSI && PSI->hasProfileSummary()) @@ -834,8 +1093,8 @@ bool CallAnalyzer::isColdCallSite(CallBase &Call, } Optional -CallAnalyzer::getHotCallSiteThreshold(CallBase &Call, - BlockFrequencyInfo *CallerBFI) { +InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, + BlockFrequencyInfo *CallerBFI) { // If global profile summary is available, then callsite's hotness is // determined based on that. @@ -862,7 +1121,7 @@ CallAnalyzer::getHotCallSiteThreshold(CallBase &Call, return None; } -void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { +void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // If no size growth is allowed for this inlining, set Threshold to 0. if (!allowSizeGrowth(Call)) { Threshold = 0; @@ -1024,19 +1283,7 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) { : ConstantInt::getFalse(I.getType()); return true; } - // Finally check for SROA candidates in comparisons. - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) { - if (isa(I.getOperand(1))) { - accumulateSROACost(CostIt, InlineConstants::InstrCost); - return true; - } - - disableSROA(CostIt); - } - - return false; + return handleSROA(I.getOperand(0), isa(I.getOperand(1))); } bool CallAnalyzer::visitSub(BinaryOperator &I) { @@ -1100,7 +1347,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { if (I.getType()->isFloatingPointTy() && TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive && !match(&I, m_FNeg(m_Value()))) - addCost(InlineConstants::CallPenalty); + onCallPenalty(); return false; } @@ -1127,23 +1374,15 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) { } bool CallAnalyzer::visitLoad(LoadInst &I) { - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { - if (I.isSimple()) { - accumulateSROACost(CostIt, InlineConstants::InstrCost); - return true; - } - - disableSROA(CostIt); - } + if (handleSROA(I.getPointerOperand(), I.isSimple())) + return true; // If the data is already loaded from this address and hasn't been clobbered // by any stores or calls, this load is likely to be redundant and can be // eliminated. 
if (EnableLoadElimination && !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { - LoadEliminationCost += InlineConstants::InstrCost; + onLoadEliminationOpportunity(); return true; } @@ -1151,16 +1390,8 @@ bool CallAnalyzer::visitLoad(LoadInst &I) { } bool CallAnalyzer::visitStore(StoreInst &I) { - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { - if (I.isSimple()) { - accumulateSROACost(CostIt, InlineConstants::InstrCost); - return true; - } - - disableSROA(CostIt); - } + if (handleSROA(I.getPointerOperand(), I.isSimple())) + return true; // The store can potentially clobber loads and prevent repeated loads from // being eliminated. @@ -1250,9 +1481,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { // in this inline context. If not, we've done all we can. F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); if (!F) { - // Pay the price of the argument setup. We account for the average 1 - // instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); + onCallArgumentSetup(Call); if (!Call.onlyReadsMemory()) disableLoadElimination(); @@ -1276,8 +1505,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { return Base::visitCallBase(Call); case Intrinsic::load_relative: - // This is normally lowered to 4 LLVM instructions. - addCost(3 * InlineConstants::InstrCost); + onLoadRelativeIntrinsic(); return false; case Intrinsic::memset: @@ -1304,28 +1532,7 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { } if (TTI.isLoweredToCall(F)) { - // We account for the average 1 instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); - - // If we have a constant that we are calling as a function, we can peer - // through it and see the function target. This happens not infrequently - // during devirtualization and so we want to give it a hefty bonus for - // inlining, but cap that bonus in the event that inlining wouldn't pan out. - // Pretend to inline the function, with a custom threshold. - if (IsIndirectCall && BoostIndirectCalls) { - auto IndirectCallParams = Params; - IndirectCallParams.DefaultThreshold = - InlineConstants::IndirectCallThreshold; - CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call, - IndirectCallParams, false); - if (CA.analyze()) { - // We were able to inline the indirect call! Subtract the cost from the - // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); - } - } else - // Otherwise simply add the cost for merely making the call. 
- addCost(InlineConstants::CallPenalty); + onLoweredCall(F, Call, IsIndirectCall); } if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) @@ -1381,9 +1588,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal)) SROAArgValues[&SI] = SROAArg; return true; } @@ -1422,9 +1627,7 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (BaseAndOffset.first) { ConstantOffsetPtrs[&SI] = BaseAndOffset; - Value *SROAArg; - DenseMap::iterator CostIt; - if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt)) + if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV)) SROAArgValues[&SI] = SROAArg; } @@ -1452,49 +1655,12 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // inlining those. It will prevent inlining in cases where the optimization // does not (yet) fire. - // Maximum valid cost increased in this function. - int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; - unsigned JumpTableSize = 0; BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - // If suitable for a jump table, consider the cost for the table size and - // branch to destination. - if (JumpTableSize) { - int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + - 4 * InlineConstants::InstrCost; - - addCost(JTCost, (int64_t)CostUpperBound); - return false; - } - - // Considering forming a binary search, we should find the number of nodes - // which is same as the number of comparisons when lowered. For a given - // number of clusters, n, we can define a recursive function, f(n), to find - // the number of nodes in the tree. The recursion is : - // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, - // and f(n) = n, when n <= 3. - // This will lead a binary tree where the leaf should be either f(2) or f(3) - // when n > 3. So, the number of comparisons from leaves should be n, while - // the number of non-leaf should be : - // 2^(log2(n) - 1) - 1 - // = 2^log2(n) * 2^-1 - 1 - // = n / 2 - 1. - // Considering comparisons from leaf and non-leaf nodes, we can estimate the - // number of comparisons in a simple closed form : - // n + n / 2 - 1 = n * 3 / 2 - 1 - if (NumCaseCluster <= 3) { - // Suppose a comparison includes one compare and one conditional branch. - addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); - return false; - } - - int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; - int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - - addCost(SwitchCost, (int64_t)CostUpperBound); + onFinalizeSwitch(JumpTableSize, NumCaseCluster); return false; } @@ -1587,7 +1753,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, if (Base::visit(&*I)) ++NumInstructionsSimplified; else - addCost(InlineConstants::InstrCost); + onCommonInstructionSimplification(); using namespace ore; // If the visit this instruction detected an uninlinable pattern, abort. @@ -1632,9 +1798,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, return IR; } - // Check if we've passed the maximum possible threshold so we don't spin in - // huge basic blocks that will never inline. 
- if (Cost >= Threshold && !ComputeFullInlineCost) + if (shouldStop()) return false; } @@ -1728,46 +1892,9 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { InlineResult CallAnalyzer::analyze() { ++NumCallsAnalyzed; - // Perform some tweaks to the cost and threshold based on the direct - // callsite information. - - // We want to more aggressively inline vector-dense kernels, so up the - // threshold, and we'll lower it if the % of vector instructions gets too - // low. Note that these bonuses are some what arbitrary and evolved over time - // by accident as much as because they are principled bonuses. - // - // FIXME: It would be nice to remove all such bonuses. At least it would be - // nice to base the bonus values on something more scientific. - assert(NumInstructions == 0); - assert(NumVectorInstructions == 0); - - // Update the threshold based on callsite properties - updateThreshold(CandidateCall, F); - - // While Threshold depends on commandline options that can take negative - // values, we want to enforce the invariant that the computed threshold and - // bonuses are non-negative. - assert(Threshold >= 0); - assert(SingleBBBonus >= 0); - assert(VectorBonus >= 0); - - // Speculatively apply all possible bonuses to Threshold. If cost exceeds - // this Threshold any time, and cost cannot decrease, we can stop processing - // the rest of the function body. - Threshold += (SingleBBBonus + VectorBonus); - - // Give out bonuses for the callsite, as the instructions setting them up - // will be gone after inlining. - addCost(-getCallsiteCost(CandidateCall, DL)); - - // If this function uses the coldcc calling convention, prefer not to inline - // it. - if (F.getCallingConv() == CallingConv::Cold) - Cost += InlineConstants::ColdccPenalty; - - // Check if we're done. This can happen due to bonuses and penalties. - if (Cost >= Threshold && !ComputeFullInlineCost) - return "high cost"; + auto Result = onAnalysisStart(); + if (!Result) + return Result; if (F.empty()) return true; @@ -1796,9 +1923,9 @@ InlineResult CallAnalyzer::analyze() { ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); // We can SROA any pointer arguments derived from alloca instructions. - if (isa(PtrArg)) { - SROAArgValues[&*FAI] = PtrArg; - SROAArgCosts[PtrArg] = 0; + if (auto *SROAArg = dyn_cast(PtrArg)) { + SROAArgValues[&*FAI] = SROAArg; + onInitializeSROAArg(SROAArg); } } } @@ -1824,12 +1951,10 @@ InlineResult CallAnalyzer::analyze() { BBSetVector; BBSetVector BBWorklist; BBWorklist.insert(&F.getEntryBlock()); - bool SingleBB = true; + // Note that we *must not* cache the size, this loop grows the worklist. for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { - // Bail out the moment we cross the threshold. This means we'll under-count - // the cost, but only when undercounting doesn't matter. - if (Cost >= Threshold && !ComputeFullInlineCost) + if (shouldStop()) break; BasicBlock *BB = BBWorklist[Idx]; @@ -1889,15 +2014,7 @@ InlineResult CallAnalyzer::analyze() { ++TIdx) BBWorklist.insert(TI->getSuccessor(TIdx)); - // If we had any successors at this point, than post-inlining is likely to - // have them as well. Note that we assume any basic blocks which existed - // due to branches or switches which folded above will also fold after - // inlining. - if (SingleBB && TI->getNumSuccessors() > 1) { - // Take off the bonus we applied to the threshold. 
- Threshold -= SingleBBBonus; - SingleBB = false; - } + onBlockAnalyzed(BB); } bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() && @@ -1908,38 +2025,12 @@ InlineResult CallAnalyzer::analyze() { if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) return "noduplicate"; - // Loops generally act a lot like calls in that they act like barriers to - // movement, require a certain amount of setup, etc. So when optimising for - // size, we penalise any call sites that perform loops. We do this after all - // other costs here, so will likely only be dealing with relatively small - // functions (and hence DT and LI will hopefully be cheap). - if (Caller->hasMinSize()) { - DominatorTree DT(F); - LoopInfo LI(DT); - int NumLoops = 0; - for (Loop *L : LI) { - // Ignore loops that will not be executed - if (DeadBlocks.count(L->getHeader())) - continue; - NumLoops++; - } - addCost(NumLoops * InlineConstants::CallPenalty); - } - - // We applied the maximum possible vector bonus at the beginning. Now, - // subtract the excess bonus, if any, from the Threshold before - // comparing against Cost. - if (NumVectorInstructions <= NumInstructions / 10) - Threshold -= VectorBonus; - else if (NumVectorInstructions <= NumInstructions / 2) - Threshold -= VectorBonus / 2; - - return Cost < std::max(1, Threshold); + return finalizeAnalysis(); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dump stats about this call's analysis. -LLVM_DUMP_METHOD void CallAnalyzer::dump() { +LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" DEBUG_PRINT_STAT(NumConstantArgs); DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); @@ -2073,8 +2164,8 @@ InlineCost llvm::getInlineCost( LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Caller->getName() << ")\n"); - CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, - Call, Params); + InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, + *Callee, Call, Params); InlineResult ShouldInline = CA.analyze(); LLVM_DEBUG(CA.dump()); @@ -2121,15 +2212,16 @@ InlineResult llvm::isInlineViable(Function &F) { switch (Call->getCalledFunction()->getIntrinsicID()) { default: break; - // Disallow inlining of @llvm.icall.branch.funnel because current - // backend can't separate call targets from call arguments. + // Disallow inlining of @llvm.icall.branch.funnel because current + // backend can't separate call targets from call arguments. case llvm::Intrinsic::icall_branch_funnel: return "disallowed inlining of @llvm.icall.branch.funnel"; - // Disallow inlining functions that call @llvm.localescape. Doing this - // correctly would require major changes to the inliner. + // Disallow inlining functions that call @llvm.localescape. Doing this + // correctly would require major changes to the inliner. case llvm::Intrinsic::localescape: return "disallowed inlining of @llvm.localescape"; - // Disallow inlining of functions that initialize VarArgs with va_start. + // Disallow inlining of functions that initialize VarArgs with + // va_start. case llvm::Intrinsic::vastart: return "contains VarArgs initialized with va_start"; } From 1e46eb74be6527377e47090bbe0fc9298f7de2c5 Mon Sep 17 00:00:00 2001 From: Johannes Doerfert Date: Tue, 7 Jan 2020 15:10:30 -0600 Subject: [PATCH 12/89] [Attributor][FIX] Avoid dangling value pointers during code modification When we replace instructions with unreachable we delete instructions. 
We now avoid dangling pointers to those deleted instructions in the
`ToBeChangedToUnreachableInsts` set. Other modification collections might
need to be updated in the future as well.
---
 llvm/include/llvm/IR/ValueHandle.h            | 19 +++++++++++++++++++
 llvm/include/llvm/Transforms/IPO/Attributor.h |  2 +-
 llvm/lib/Transforms/IPO/Attributor.cpp        |  5 +++--
 .../Attributor/undefined_behavior.ll          | 10 ++++++++++
 4 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/IR/ValueHandle.h b/llvm/include/llvm/IR/ValueHandle.h
index 11ac2a8608d81..50b7701f67162 100644
--- a/llvm/include/llvm/IR/ValueHandle.h
+++ b/llvm/include/llvm/IR/ValueHandle.h
@@ -171,6 +171,25 @@ template <> struct simplify_type {
   static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; }
 };
 
+// Specialize DenseMapInfo to allow WeakVH to participate in DenseMap.
+template <> struct DenseMapInfo<WeakVH> {
+  static inline WeakVH getEmptyKey() {
+    return WeakVH(DenseMapInfo<Value *>::getEmptyKey());
+  }
+
+  static inline WeakVH getTombstoneKey() {
+    return WeakVH(DenseMapInfo<Value *>::getTombstoneKey());
+  }
+
+  static unsigned getHashValue(const WeakVH &Val) {
+    return DenseMapInfo<Value *>::getHashValue(Val);
+  }
+
+  static bool isEqual(const WeakVH &LHS, const WeakVH &RHS) {
+    return DenseMapInfo<Value *>::isEqual(LHS, RHS);
+  }
+};
+
 /// Value handle that is nullable, but tries to track the Value.
 ///
 /// This is a value handle that tries hard to point to a Value, even across
diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 53236b54ff0f0..4edfec2e982dd 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -1174,7 +1174,7 @@ struct Attributor {
   DenseMap<Use *, Value *> ToBeChangedUses;
 
   /// Instructions we replace with `unreachable` insts after manifest is done.
-  SmallPtrSet<Instruction *, 8> ToBeChangedToUnreachableInsts;
+  SmallDenseSet<WeakVH, 8> ToBeChangedToUnreachableInsts;
 
   /// Functions, blocks, and instructions we delete after manifest is done.
 ///
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index 13fcf6aa72472..b3b9e1e185a6c 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -5668,8 +5668,9 @@ ChangeStatus Attributor::run(Module &M) {
       }
     }
   }
 
-  for (Instruction *I : ToBeChangedToUnreachableInsts)
-    changeToUnreachable(I, /* UseLLVMTrap */ false);
+  for (auto &V : ToBeChangedToUnreachableInsts)
+    if (Instruction *I = dyn_cast_or_null<Instruction>(V))
+      changeToUnreachable(I, /* UseLLVMTrap */ false);
   for (Instruction *I : TerminatorsToFold)
     ConstantFoldTerminator(I->getParent());
diff --git a/llvm/test/Transforms/Attributor/undefined_behavior.ll b/llvm/test/Transforms/Attributor/undefined_behavior.ll
index e9b782452182f..fd0ddb1ebb850 100644
--- a/llvm/test/Transforms/Attributor/undefined_behavior.ll
+++ b/llvm/test/Transforms/Attributor/undefined_behavior.ll
@@ -16,6 +16,16 @@ define void @load_wholly_unreachable() {
   ret void
 }
 
+define void @loads_wholly_unreachable() {
+; ATTRIBUTOR-LABEL: @loads_wholly_unreachable(
+; ATTRIBUTOR-NEXT:    unreachable
+;
+  %a = load i32, i32* null
+  %b = load i32, i32* null
+  ret void
+}
+
+
 define void @load_single_bb_unreachable(i1 %cond) {
 ; ATTRIBUTOR-LABEL: @load_single_bb_unreachable(
 ; ATTRIBUTOR-NEXT:    br i1 [[COND:%.*]], label [[T:%.*]], label [[E:%.*]]

From a4088c75cc1034307400076d29b35905d0ae58b2 Mon Sep 17 00:00:00 2001
From: Johannes Doerfert
Date: Tue, 7 Jan 2020 16:01:57 -0600
Subject: [PATCH 13/89] [Attributor][FIX] Carefully change invokes to calls (after manifest)

Previously we manually inserted unreachable instructions early, but that
could lead to broken PHI nodes. Now we use the existing late modification
functionality.
---
 llvm/include/llvm/Transforms/IPO/Attributor.h |  10 ++
 llvm/lib/Transforms/IPO/Attributor.cpp        | 119 ++++++------------
 .../Attributor/ArgumentPromotion/crash.ll     |   8 --
 .../Attributor/ArgumentPromotion/fp80.ll      |   2 -
 .../live_called_from_dead.ll                  |   2 +-
 .../nonzero-address-spaces.ll                 |   2 -
 .../Attributor/IPConstantProp/recursion.ll    |   2 -
 .../IPConstantProp/return-constant.ll         |   2 -
 llvm/test/Transforms/Attributor/liveness.ll   | 103 +++++++++++++--
 .../Transforms/Attributor/noreturn_async.ll   |   4 +-
 10 files changed, 144 insertions(+), 110 deletions(-)

diff --git a/llvm/include/llvm/Transforms/IPO/Attributor.h b/llvm/include/llvm/Transforms/IPO/Attributor.h
index 4edfec2e982dd..70642b201adb3 100644
--- a/llvm/include/llvm/Transforms/IPO/Attributor.h
+++ b/llvm/include/llvm/Transforms/IPO/Attributor.h
@@ -865,6 +865,13 @@ struct Attributor {
     ToBeChangedToUnreachableInsts.insert(I);
   }
 
+  /// Record that \p II has at least one dead successor block. This information
+  /// is used, e.g., to replace \p II with a call, after information was
+  /// manifested.
+  void registerInvokeWithDeadSuccessor(InvokeInst &II) {
+    InvokeWithDeadSuccessor.push_back(&II);
+  }
+
   /// Record that \p I is deleted after information was manifested. This also
   /// triggers deletion of trivially dead istructions.
   void deleteAfterManifest(Instruction &I) { ToBeDeletedInsts.insert(&I); }
@@ -1176,6 +1183,9 @@ struct Attributor {
   /// Instructions we replace with `unreachable` insts after manifest is done.
   SmallDenseSet<WeakVH, 8> ToBeChangedToUnreachableInsts;
 
+  /// Invoke instructions with at least a single dead successor block.
+  SmallVector<WeakVH, 16> InvokeWithDeadSuccessor;
+
   /// Functions, blocks, and instructions we delete after manifest is done.
/// ///{ diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index b3b9e1e185a6c..39e2057b1b6ee 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -31,6 +31,7 @@ #include "llvm/IR/CFG.h" #include "llvm/IR/InstIterator.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Verifier.h" #include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2824,90 +2825,12 @@ struct AAIsDeadFunction : public AAIsDead { bool MayReturn = !NoReturnAA.isAssumedNoReturn(); if (MayReturn && (!Invoke2CallAllowed || !isa(CB))) continue; - Instruction *I = const_cast(DeadEndI); - BasicBlock *BB = I->getParent(); - Instruction *SplitPos = I->getNextNode(); - // TODO: mark stuff before unreachable instructions as dead. - - if (auto *II = dyn_cast(I)) { - // If we keep the invoke the split position is at the beginning of the - // normal desitination block (it invokes a noreturn function after all). - BasicBlock *NormalDestBB = II->getNormalDest(); - SplitPos = &NormalDestBB->front(); - - /// Invoke is replaced with a call and unreachable is placed after it if - /// the callee is nounwind and noreturn. Otherwise, we keep the invoke - /// and only place an unreachable in the normal successor. - if (Invoke2CallAllowed) { - if (II->getCalledFunction()) { - const IRPosition &IPos = IRPosition::callsite_function(*II); - const auto &AANoUnw = A.getAAFor(*this, IPos); - if (AANoUnw.isAssumedNoUnwind()) { - LLVM_DEBUG(dbgs() - << "[AAIsDead] Replace invoke with call inst\n"); - CallInst *CI = createCallMatchingInvoke(II); - CI->insertBefore(II); - CI->takeName(II); - replaceAllInstructionUsesWith(*II, *CI); - - // If this is a nounwind + mayreturn invoke we only remove the - // unwind edge. This is done by moving the invoke into a new and - // dead block and connecting the normal destination of the invoke - // with a branch that follows the call replacement we created - // above. - if (MayReturn) { - BasicBlock *NewDeadBB = - SplitBlock(BB, II, nullptr, nullptr, nullptr, ".i2c"); - assert(isa(BB->getTerminator()) && - BB->getTerminator()->getNumSuccessors() == 1 && - BB->getTerminator()->getSuccessor(0) == NewDeadBB); - new UnreachableInst(I->getContext(), NewDeadBB); - BB->getTerminator()->setOperand(0, NormalDestBB); - A.deleteAfterManifest(*II); - continue; - } - - // We do not need an invoke (II) but instead want a call followed - // by an unreachable. However, we do not remove II as other - // abstract attributes might have it cached as part of their - // results. Given that we modify the CFG anyway, we simply keep II - // around but in a new dead block. To avoid II being live through - // a different edge we have to ensure the block we place it in is - // only reached from the current block of II and then not reached - // at all when we insert the unreachable. - SplitBlockPredecessors(NormalDestBB, {BB}, ".i2c"); - SplitPos = CI->getNextNode(); - } - } - } - - if (SplitPos == &NormalDestBB->front()) { - // If this is an invoke of a noreturn function the edge to the normal - // destination block is dead but not necessarily the block itself. - // TODO: We need to move to an edge based system during deduction and - // also manifest. 
- assert(!NormalDestBB->isLandingPad() && - "Expected the normal destination not to be a landingpad!"); - if (NormalDestBB->getUniquePredecessor() == BB) { - assumeLive(A, *NormalDestBB); - } else { - BasicBlock *SplitBB = - SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); - // The split block is live even if it contains only an unreachable - // instruction at the end. - assumeLive(A, *SplitBB); - SplitPos = SplitBB->getTerminator(); - HasChanged = ChangeStatus::CHANGED; - } - } - } - if (isa_and_nonnull(SplitPos)) - continue; - - BB = SplitPos->getParent(); - SplitBlock(BB, SplitPos); - A.changeToUnreachableAfterManifest(BB->getTerminator()); + if (auto *II = dyn_cast(DeadEndI)) + A.registerInvokeWithDeadSuccessor(const_cast(*II)); + else + A.changeToUnreachableAfterManifest( + const_cast(DeadEndI->getNextNode())); HasChanged = ChangeStatus::CHANGED; } @@ -5668,6 +5591,32 @@ ChangeStatus Attributor::run(Module &M) { } } } + for (auto &V : InvokeWithDeadSuccessor) + if (InvokeInst *II = dyn_cast_or_null(V)) { + bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind); + bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn); + bool Invoke2CallAllowed = + !AAIsDeadFunction::mayCatchAsynchronousExceptions( + *II->getFunction()); + assert((UnwindBBIsDead || NormalBBIsDead) && + "Invoke does not have dead successors!"); + BasicBlock *BB = II->getParent(); + BasicBlock *NormalDestBB = II->getNormalDest(); + if (UnwindBBIsDead) { + Instruction *NormalNextIP = &NormalDestBB->front(); + if (Invoke2CallAllowed) { + changeToCall(II); + NormalNextIP = BB->getTerminator(); + } + if (NormalBBIsDead) + ToBeChangedToUnreachableInsts.insert(NormalNextIP); + } else { + assert(NormalBBIsDead && "Broken invariant!"); + if (!NormalDestBB->getUniquePredecessor()) + NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead"); + ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front()); + } + } for (auto &V : ToBeChangedToUnreachableInsts) if (Instruction *I = dyn_cast_or_null(V)) changeToUnreachable(I, /* UseLLVMTrap */ false); @@ -6337,7 +6286,9 @@ static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) { A.identifyDefaultAbstractAttributes(F); } - return A.run(M) == ChangeStatus::CHANGED; + bool Changed = A.run(M) == ChangeStatus::CHANGED; + assert(!verifyModule(M, &errs()) && "Module verification failed!"); + return Changed; } PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) { diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll index 87a3ba5811e16..db3db632e5f24 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/crash.ll @@ -11,10 +11,6 @@ define void @zot() personality i32 (...)* @wibble { ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: call void @hoge() ; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb.split: -; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb1.i2c: -; ATTRIBUTOR-NEXT: unreachable ; ATTRIBUTOR: bb1: ; ATTRIBUTOR-NEXT: unreachable ; ATTRIBUTOR: bb2: @@ -47,8 +43,6 @@ define internal void @hoge() { ; ATTRIBUTOR-LABEL: define {{[^@]+}}@hoge() ; ATTRIBUTOR-NEXT: bb: ; ATTRIBUTOR-NEXT: unreachable -; ATTRIBUTOR: bb.split: -; ATTRIBUTOR-NEXT: unreachable ; bb: %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs) @@ -77,8 +71,6 @@ define i32 @test_inf_promote_caller(i32 %arg) { ; CHECK-SAME: (i32 [[ARG:%.*]]) ; CHECK-NEXT: bb: ; CHECK-NEXT: unreachable -; CHECK: bb.split: -; CHECK-NEXT: unreachable ; bb: 
%tmp = alloca %S diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll index 36adfe08a4d20..153ce6893ba2a 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/fp80.ll @@ -17,8 +17,6 @@ define void @run() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @CaptureAStruct(%struct.Foo* nofree nonnull readonly align 8 dereferenceable(16) @a) ; CHECK-NEXT: unreachable -; CHECK: entry.split: -; CHECK-NEXT: unreachable ; entry: tail call i8 @UseLongDoubleUnsafely(%union.u* byval align 16 bitcast (%struct.s* @b to %union.u*)) diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll index b81d35491f944..a5ca51e9bd996 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/live_called_from_dead.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --scrub-attributes ; RUN: opt -S -basicaa -attributor -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,OLDPM_MODULE ; RUN: opt -S -passes='attributor' -aa-pipeline='basic-aa' -attributor-disable=false -attributor-max-iterations-verify -attributor-max-iterations=3 < %s | FileCheck %s --check-prefixes=CHECK,NEWPM_MODULE diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll index 271854a224569..d08969c0a2620 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/nonzero-address-spaces.ll @@ -13,8 +13,6 @@ define i32 @bar() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call addrspace(1) i32 @foo() ; CHECK-NEXT: unreachable -; CHECK: entry.split: -; CHECK-NEXT: unreachable ; entry: diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll b/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll index b9fd0468d380a..fc82342a989f6 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/recursion.ll @@ -12,8 +12,6 @@ define internal i32 @foo(i32 %X) { define void @bar() { ; CHECK-LABEL: define {{[^@]+}}@bar() ; CHECK-NEXT: unreachable -; CHECK: .split: -; CHECK-NEXT: unreachable ; call i32 @foo( i32 17 ) ; :1 [#uses=0] ret void diff --git a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll index 04927726daa25..f30461c746af8 100644 --- a/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll +++ b/llvm/test/Transforms/Attributor/IPConstantProp/return-constant.ll @@ -8,8 +8,6 @@ define i1 @invokecaller(i1 %C) personality i32 (...)* @__gxx_personality_v0 { ; CHECK-SAME: (i1 [[C:%.*]]) #0 personality i32 (...)* @__gxx_personality_v0 ; CHECK-NEXT: [[X:%.*]] = call i32 @foo(i1 [[C]]) ; CHECK-NEXT: br label [[OK:%.*]] -; CHECK: .i2c: -; CHECK-NEXT: unreachable ; CHECK: OK: ; CHECK-NEXT: [[Y:%.*]] = icmp ne i32 52, 0 ; CHECK-NEXT: ret i1 [[Y]] diff --git a/llvm/test/Transforms/Attributor/liveness.ll 
b/llvm/test/Transforms/Attributor/liveness.ll index cf9dc8b789dd3..4dd37865fcc09 100644 --- a/llvm/test/Transforms/Attributor/liveness.ll +++ b/llvm/test/Transforms/Attributor/liveness.ll @@ -277,6 +277,96 @@ cleanup: ret i32 0 } +; UTC_ARGS: --turn on + +; TEST 5.4 unounwind invoke instruction replaced by a call and a branch instruction put after it. +define i32 @invoke_nounwind_phi(i32 %a) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define {{[^@]+}}@invoke_nounwind_phi +; CHECK-SAME: (i32 [[A:%.*]]) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo_nounwind() +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: cond.false: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar() +; CHECK-NEXT: br label [[CONTINUE]] +; CHECK: continue: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ 0, [[COND_TRUE]] ], [ 1, [[COND_FALSE]] ] +; CHECK-NEXT: ret i32 [[P]] +; CHECK: cleanup: +; CHECK-NEXT: unreachable +; +entry: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @normal_call() + %call = invoke i32 @foo_nounwind() to label %continue + unwind label %cleanup + +cond.false: ; preds = %entry + call void @normal_call() + %call1 = call i32 @bar() + br label %continue + +continue: + %p = phi i32 [ 0, %cond.true ], [ 1, %cond.false ] + ret i32 %p + +cleanup: + %res = landingpad { i8*, i32 } catch i8* null + ret i32 0 +} + +; TEST 5.5 unounwind invoke instruction replaced by a call and a branch instruction put after it. +define i32 @invoke_nounwind_phi_dom(i32 %a) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { +; CHECK-LABEL: define {{[^@]+}}@invoke_nounwind_phi_dom +; CHECK-SAME: (i32 [[A:%.*]]) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[A]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +; CHECK: cond.true: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL:%.*]] = call i32 @foo_nounwind() +; CHECK-NEXT: br label [[CONTINUE:%.*]] +; CHECK: cond.false: +; CHECK-NEXT: call void @normal_call() +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar() +; CHECK-NEXT: br label [[CONTINUE]] +; CHECK: continue: +; CHECK-NEXT: [[P:%.*]] = phi i32 [ [[CALL]], [[COND_TRUE]] ], [ [[CALL1]], [[COND_FALSE]] ] +; CHECK-NEXT: ret i32 [[P]] +; CHECK: cleanup: +; CHECK-NEXT: unreachable +; +entry: + %cmp = icmp eq i32 %a, 0 + br i1 %cmp, label %cond.true, label %cond.false + +cond.true: ; preds = %entry + call void @normal_call() + %call = invoke i32 @foo_nounwind() to label %continue + unwind label %cleanup + +cond.false: ; preds = %entry + call void @normal_call() + %call1 = call i32 @bar() + br label %continue + +continue: + %p = phi i32 [ %call, %cond.true ], [ %call1, %cond.false ] + ret i32 %p + +cleanup: + %res = landingpad { i8*, i32 } catch i8* null + ret i32 0 +} + ; UTC_ARGS: --turn off ; TEST 6: Undefined behvior, taken from LangRef. 
@@ -707,7 +797,6 @@ define internal void @dead_e2() { ret void } ; CHECK-NEXT: define internal void @non_dead_d15() ; CHECK-NOT: define internal void @dead_e - declare void @blowup() noreturn define void @live_with_dead_entry() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) { ; CHECK: define void @live_with_dead_entry( @@ -735,19 +824,19 @@ define void @live_with_dead_entry_lp() personality i8* bitcast (i32 (...)* @__gx ; CHECK: define void @live_with_dead_entry_lp( ; CHECK-NEXT: entry: ; CHECK-NEXT: invoke void @blowup() -; CHECK-NEXT: to label %live_with_dead_entry.dead unwind label %lp1 -; CHECK: lp1: ; preds = %entry +; CHECK-NEXT: to label %[[LIVE_WITH_DEAD_ENTRY_DEAD1:.*]] unwind label %[[LP1:.*]] +; CHECK: [[LP1]]: ; preds = %entry ; CHECK-NEXT: %lp = landingpad { i8*, i32 } ; CHECK-NEXT: catch i8* null ; CHECK-NEXT: invoke void @blowup() -; CHECK-NEXT: to label %live_with_dead_entry.dead1 unwind label %lp2 -; CHECK: lp2: ; preds = %lp1 +; CHECK-NEXT: to label %[[LIVE_WITH_DEAD_ENTRY_DEAD2:.*]] unwind label %[[LP2:.*]] +; CHECK: [[LP2]]: ; preds = %lp1 ; CHECK-NEXT: %0 = landingpad { i8*, i32 } ; CHECK-NEXT: catch i8* null ; CHECK-NEXT: br label %live_with_dead_entry -; CHECK: live_with_dead_entry.dead: +; CHECK: [[LIVE_WITH_DEAD_ENTRY_DEAD1]]: ; CHECK-NEXT: unreachable -; CHECK: live_with_dead_entry.dead1: +; CHECK: [[LIVE_WITH_DEAD_ENTRY_DEAD2]]: ; CHECK-NEXT: unreachable ; CHECK: live_with_dead_entry: ; preds = %lp2 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/Attributor/noreturn_async.ll b/llvm/test/Transforms/Attributor/noreturn_async.ll index 7c00a5a0b5cbf..9fb99159acf5d 100644 --- a/llvm/test/Transforms/Attributor/noreturn_async.ll +++ b/llvm/test/Transforms/Attributor/noreturn_async.ll @@ -100,9 +100,9 @@ define dso_local i32 @"?catchoverflow@@YAHXZ_may_throw"() personality i8* bitca entry: %retval = alloca i32, align 4 %__exception_code = alloca i32, align 4 -; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"() +; CHECK: invoke void @"?overflow@@YAXXZ_may_throw"() ; CHECK: to label %invoke.cont unwind label %catch.dispatch - invoke void @"?overflow@@YAXXZ_may_throw"() + invoke void @"?overflow@@YAXXZ_may_throw"() to label %invoke.cont unwind label %catch.dispatch invoke.cont: ; preds = %entry From 338a601612ca36e112b14f622eb310985b93192a Mon Sep 17 00:00:00 2001 From: Mircea Trofin Date: Wed, 8 Jan 2020 17:42:23 -0800 Subject: [PATCH 14/89] Revert "[NFC][InlineCost] Factor cost modeling out of CallAnalyzer traversal." This reverts commit 76aab66d34446ccf764cf8127b73e1517df75fb4. Failure: http://lab.llvm.org:8011/builders/clang-with-thin-lto-ubuntu/builds/20562, will investigate and resubmit. --- llvm/lib/Analysis/InlineCost.cpp | 752 ++++++++++++++----------------- 1 file changed, 330 insertions(+), 422 deletions(-) diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp index 017301fce9b70..b5f4192bf8562 100644 --- a/llvm/lib/Analysis/InlineCost.cpp +++ b/llvm/lib/Analysis/InlineCost.cpp @@ -93,12 +93,11 @@ static cl::opt OptComputeFullInlineCost( "exceeds the threshold.")); namespace { -class InlineCostCallAnalyzer; + class CallAnalyzer : public InstVisitor { typedef InstVisitor Base; friend class InstVisitor; -protected: /// The TargetTransformInfo available for this compilation. const TargetTransformInfo &TTI; @@ -125,86 +124,20 @@ class CallAnalyzer : public InstVisitor { /// easily cacheable. Instead, use the cover function paramHasAttr. 
CallBase &CandidateCall; - /// Extension points for handling callsite features. - /// Called after a basic block was analyzed. - virtual void onBlockAnalyzed(const BasicBlock *BB) {} - - /// Called at the end of the analysis of the callsite. Return the outcome of - /// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or - /// the reason it can't. - virtual InlineResult finalizeAnalysis() { return true; } - - /// Called when we're about to start processing a basic block, and every time - /// we are done processing an instruction. Return true if there is no point in - /// continuing the analysis (e.g. we've determined already the call site is - /// too expensive to inline) - virtual bool shouldStop() { return false; } - - /// Called before the analysis of the callee body starts (with callsite - /// contexts propagated). It checks callsite-specific information. Return a - /// reason analysis can't continue if that's the case, or 'true' if it may - /// continue. - virtual InlineResult onAnalysisStart() { return true; } - - /// Called if the analysis engine decides SROA cannot be done for the given - /// alloca. - virtual void onDisableSROA(AllocaInst *Arg) {} - - /// Called the analysis engine determines load elimination won't happen. - virtual void onDisableLoadElimination() {} - - /// Called to account for a call. - virtual void onCallPenalty() {} - - /// Called to account for the expectation the inlining would result in a load - /// elimination. - virtual void onLoadEliminationOpportunity() {} - - /// Called to account for the cost of argument setup for the Call in the - /// callee's body (not the callsite currently under analysis). - virtual void onCallArgumentSetup(const CallBase &Call) {} - - /// Called to account for a load relative intrinsic. - virtual void onLoadRelativeIntrinsic() {} - - /// Called to account for a lowered call. - virtual void onLoweredCall(Function *F, CallBase &Call, bool IsIndirectCall) { - } - - /// Account for a jump table of given size. Return false to stop further - /// processing the switch instruction - virtual bool onJumpTable(unsigned JumpTableSize) { return true; } - - /// Account for a case cluster of given size. Return false to stop further - /// processing of the instruction. - virtual bool onCaseCluster(unsigned NumCaseCluster) { return true; } - - /// Called at the end of processing a switch instruction, with the given - /// number of case clusters. - virtual void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) {} - - /// Called to account for any other instruction not specifically accounted - /// for. - virtual void onCommonInstructionSimplification() {} + /// Tunable parameters that control the analysis. + const InlineParams &Params; - /// Start accounting potential benefits due to SROA for the given alloca. - virtual void onInitializeSROAArg(AllocaInst *Arg) {} + /// Upper bound for the inlining cost. Bonuses are being applied to account + /// for speculative "expected profit" of the inlining decision. + int Threshold; - /// Account SROA savings for the AllocaInst value. - virtual void onAggregateSROAUse(AllocaInst *V) {} + /// Inlining cost measured in abstract units, accounts for all the + /// instructions expected to be executed for a given function invocation. + /// Instructions that are statically proven to be dead based on call-site + /// arguments are not counted here. + int Cost = 0; - bool handleSROA(Value *V, bool DoNotDisable) { - // Check for SROA candidates in comparisons. 
- if (auto *SROAArg = getSROAArgForValueOrNull(V)) { - if (DoNotDisable) { - onAggregateSROAUse(SROAArg); - return true; - } - disableSROA(SROAArg); - } - return false; - } + bool ComputeFullInlineCost; bool IsCallerRecursive = false; bool IsRecursiveCall = false; @@ -216,11 +149,20 @@ class CallAnalyzer : public InstVisitor { bool HasUninlineableIntrinsic = false; bool InitsVargArgs = false; + /// Attempt to evaluate indirect calls to boost its inline cost. + bool BoostIndirectCalls; + /// Number of bytes allocated statically by the callee. uint64_t AllocatedSize = 0; unsigned NumInstructions = 0; unsigned NumVectorInstructions = 0; + /// Bonus to be applied when percentage of vector instructions in callee is + /// high (see more details in updateThreshold). + int VectorBonus = 0; + /// Bonus to be applied when the callee has only one reachable basic block. + int SingleBBBonus = 0; + /// While we walk the potentially-inlined instructions, we build up and /// maintain a mapping of simplified values specific to this callsite. The /// idea is to propagate any special information we have about arguments to @@ -232,10 +174,12 @@ class CallAnalyzer : public InstVisitor { /// Keep track of the values which map back (through function arguments) to /// allocas on the caller stack which could be simplified through SROA. - /// We can disable an association (because for some reason the SROA oportunity - /// is lost) by setting the value to nullptr. We don't delete because we still - /// want isAllocaDerivedArg to function correctly. - DenseMap SROAArgValues; + DenseMap SROAArgValues; + + /// The mapping of caller Alloca values to their accumulated cost savings. If + /// we have to disable SROA for one of the allocas, this tells us how much + /// cost must be added. + DenseMap SROAArgCosts; /// Keep track of values which map to a pointer base and constant offset. DenseMap> ConstantOffsetPtrs; @@ -252,18 +196,17 @@ class CallAnalyzer : public InstVisitor { /// loads. bool EnableLoadElimination; SmallPtrSet LoadAddrSet; - - AllocaInst *getSROAArgForValueOrNull(Value *V) const { - auto It = SROAArgValues.find(V); - if (It == SROAArgValues.end()) - return nullptr; - return It->second; - } + int LoadEliminationCost = 0; // Custom simplification helper routines. bool isAllocaDerivedArg(Value *V); + bool lookupSROAArgAndCost(Value *V, Value *&Arg, + DenseMap::iterator &CostIt); + void disableSROA(DenseMap::iterator CostIt); void disableSROA(Value *V); void findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB); + void accumulateSROACost(DenseMap::iterator CostIt, + int InstructionCost); void disableLoadElimination(); bool isGEPFree(GetElementPtrInst &GEP); bool canFoldInboundsGEP(GetElementPtrInst &I); @@ -284,13 +227,32 @@ class CallAnalyzer : public InstVisitor { /// inlined through this particular callsite. bool isKnownNonNullInCallee(Value *V); + /// Update Threshold based on callsite properties such as callee + /// attributes and callee hotness for PGO builds. The Callee is explicitly + /// passed to support analyzing indirect calls whose target is inferred by + /// analysis. + void updateThreshold(CallBase &Call, Function &Callee); + /// Return true if size growth is allowed when inlining the callee at \p Call. bool allowSizeGrowth(CallBase &Call); + /// Return true if \p Call is a cold callsite. + bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); + + /// Return a higher threshold if \p Call is a hot callsite. 
+ Optional getHotCallSiteThreshold(CallBase &Call, + BlockFrequencyInfo *CallerBFI); + // Custom analysis routines. InlineResult analyzeBlock(BasicBlock *BB, SmallPtrSetImpl &EphValues); + /// Handle a capped 'int' increment for Cost. + void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) { + assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound"); + Cost = (int)std::min(UpperBound, Cost + Inc); + } + // Disable several entry points to the visitor so we don't accidentally use // them by declaring but not defining them here. void visit(Module *); @@ -336,13 +298,20 @@ class CallAnalyzer : public InstVisitor { std::function &GetAssumptionCache, Optional> &GetBFI, ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, - Function &Callee, CallBase &Call) + Function &Callee, CallBase &Call, const InlineParams &Params, + bool BoostIndirect = true) : TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI), PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE), - CandidateCall(Call), EnableLoadElimination(true) {} + CandidateCall(Call), Params(Params), Threshold(Params.DefaultThreshold), + ComputeFullInlineCost(OptComputeFullInlineCost || + Params.ComputeFullInlineCost || ORE), + BoostIndirectCalls(BoostIndirect), EnableLoadElimination(true) {} InlineResult analyze(); + int getThreshold() { return Threshold; } + int getCost() { return Cost; } + // Keep a bunch of stats about the cost savings found so we can print them // out when debugging. unsigned NumConstantArgs = 0; @@ -351,283 +320,12 @@ class CallAnalyzer : public InstVisitor { unsigned NumConstantPtrCmps = 0; unsigned NumConstantPtrDiffs = 0; unsigned NumInstructionsSimplified = 0; - - void dump(); -}; - -/// FIXME: if it is necessary to derive from InlineCostCallAnalyzer, note -/// the FIXME in onLoweredCall, when instantiating an InlineCostCallAnalyzer -class InlineCostCallAnalyzer final : public CallAnalyzer { - const int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; - const bool ComputeFullInlineCost; - int LoadEliminationCost = 0; - /// Bonus to be applied when percentage of vector instructions in callee is - /// high (see more details in updateThreshold). - int VectorBonus = 0; - /// Bonus to be applied when the callee has only one reachable basic block. - int SingleBBBonus = 0; - - /// Tunable parameters that control the analysis. - const InlineParams &Params; - - /// Upper bound for the inlining cost. Bonuses are being applied to account - /// for speculative "expected profit" of the inlining decision. - int Threshold = 0; - - /// Attempt to evaluate indirect calls to boost its inline cost. - const bool BoostIndirectCalls; - - /// Inlining cost measured in abstract units, accounts for all the - /// instructions expected to be executed for a given function invocation. - /// Instructions that are statically proven to be dead based on call-site - /// arguments are not counted here. - int Cost = 0; - - bool SingleBB = true; - unsigned SROACostSavings = 0; unsigned SROACostSavingsLost = 0; - /// The mapping of caller Alloca values to their accumulated cost savings. If - /// we have to disable SROA for one of the allocas, this tells us how much - /// cost must be added. - DenseMap SROAArgCosts; - - /// Return true if \p Call is a cold callsite. - bool isColdCallSite(CallBase &Call, BlockFrequencyInfo *CallerBFI); - - /// Update Threshold based on callsite properties such as callee - /// attributes and callee hotness for PGO builds. 
The Callee is explicitly - /// passed to support analyzing indirect calls whose target is inferred by - /// analysis. - void updateThreshold(CallBase &Call, Function &Callee); - /// Return a higher threshold if \p Call is a hot callsite. - Optional getHotCallSiteThreshold(CallBase &Call, - BlockFrequencyInfo *CallerBFI); - - /// Handle a capped 'int' increment for Cost. - void addCost(int64_t Inc, int64_t UpperBound = INT_MAX) { - assert(UpperBound > 0 && UpperBound <= INT_MAX && "invalid upper bound"); - Cost = (int)std::min(UpperBound, Cost + Inc); - } - - void onDisableSROA(AllocaInst *Arg) override { - auto CostIt = SROAArgCosts.find(Arg); - if (CostIt == SROAArgCosts.end()) - return; - addCost(CostIt->second); - SROACostSavings -= CostIt->second; - SROACostSavingsLost += CostIt->second; - SROAArgCosts.erase(CostIt); - } - - void onDisableLoadElimination() override { - addCost(LoadEliminationCost); - LoadEliminationCost = 0; - } - void onCallPenalty() override { addCost(InlineConstants::CallPenalty); } - void onCallArgumentSetup(const CallBase &Call) override { - // Pay the price of the argument setup. We account for the average 1 - // instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); - } - void onLoadRelativeIntrinsic() override { - // This is normally lowered to 4 LLVM instructions. - addCost(3 * InlineConstants::InstrCost); - } - void onLoweredCall(Function *F, CallBase &Call, - bool IsIndirectCall) override { - // We account for the average 1 instruction per call argument setup here. - addCost(Call.arg_size() * InlineConstants::InstrCost); - - // If we have a constant that we are calling as a function, we can peer - // through it and see the function target. This happens not infrequently - // during devirtualization and so we want to give it a hefty bonus for - // inlining, but cap that bonus in the event that inlining wouldn't pan out. - // Pretend to inline the function, with a custom threshold. - if (IsIndirectCall && BoostIndirectCalls) { - auto IndirectCallParams = Params; - IndirectCallParams.DefaultThreshold = - InlineConstants::IndirectCallThreshold; - /// FIXME: if InlineCostCallAnalyzer is derived from, this may need - /// to instantiate the derived class. - InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, - Call, IndirectCallParams, false); - if (CA.analyze()) { - // We were able to inline the indirect call! Subtract the cost from the - // threshold to get the bonus we want to apply, but don't go below zero. - Cost -= std::max(0, CA.getThreshold() - CA.getCost()); - } - } else - // Otherwise simply add the cost for merely making the call. - addCost(InlineConstants::CallPenalty); - } - - void onFinalizeSwitch(unsigned JumpTableSize, - unsigned NumCaseCluster) override { - // If suitable for a jump table, consider the cost for the table size and - // branch to destination. - // Maximum valid cost increased in this function. - if (JumpTableSize) { - int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + - 4 * InlineConstants::InstrCost; - - addCost(JTCost, (int64_t)CostUpperBound); - return; - } - // Considering forming a binary search, we should find the number of nodes - // which is same as the number of comparisons when lowered. For a given - // number of clusters, n, we can define a recursive function, f(n), to find - // the number of nodes in the tree. The recursion is : - // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, - // and f(n) = n, when n <= 3. 
- // This will lead a binary tree where the leaf should be either f(2) or f(3) - // when n > 3. So, the number of comparisons from leaves should be n, while - // the number of non-leaf should be : - // 2^(log2(n) - 1) - 1 - // = 2^log2(n) * 2^-1 - 1 - // = n / 2 - 1. - // Considering comparisons from leaf and non-leaf nodes, we can estimate the - // number of comparisons in a simple closed form : - // n + n / 2 - 1 = n * 3 / 2 - 1 - if (NumCaseCluster <= 3) { - // Suppose a comparison includes one compare and one conditional branch. - addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); - return; - } - - int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; - int64_t SwitchCost = - ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; - - addCost(SwitchCost, (int64_t)CostUpperBound); - } - void onCommonInstructionSimplification() override { - addCost(InlineConstants::InstrCost); - } - - void onInitializeSROAArg(AllocaInst *Arg) override { SROAArgCosts[Arg] = 0; } - - void onAggregateSROAUse(AllocaInst *SROAArg) override { - auto CostIt = SROAArgCosts.find(SROAArg); - CostIt->second += InlineConstants::InstrCost; - SROACostSavings += InlineConstants::InstrCost; - } - - virtual void onBlockAnalyzed(const BasicBlock *BB) override { - auto *TI = BB->getTerminator(); - // If we had any successors at this point, than post-inlining is likely to - // have them as well. Note that we assume any basic blocks which existed - // due to branches or switches which folded above will also fold after - // inlining. - if (SingleBB && TI->getNumSuccessors() > 1) { - // Take off the bonus we applied to the threshold. - Threshold -= SingleBBBonus; - SingleBB = false; - } - } - virtual InlineResult finalizeAnalysis() override { - // Loops generally act a lot like calls in that they act like barriers to - // movement, require a certain amount of setup, etc. So when optimising for - // size, we penalise any call sites that perform loops. We do this after all - // other costs here, so will likely only be dealing with relatively small - // functions (and hence DT and LI will hopefully be cheap). - auto *Caller = CandidateCall.getFunction(); - if (Caller->hasMinSize()) { - DominatorTree DT(F); - LoopInfo LI(DT); - int NumLoops = 0; - for (Loop *L : LI) { - // Ignore loops that will not be executed - if (DeadBlocks.count(L->getHeader())) - continue; - NumLoops++; - } - addCost(NumLoops * InlineConstants::CallPenalty); - } - - // We applied the maximum possible vector bonus at the beginning. Now, - // subtract the excess bonus, if any, from the Threshold before - // comparing against Cost. - if (NumVectorInstructions <= NumInstructions / 10) - Threshold -= VectorBonus; - else if (NumVectorInstructions <= NumInstructions / 2) - Threshold -= VectorBonus / 2; - - return Cost < std::max(1, Threshold); - } - virtual bool shouldStop() override { - // Bail out the moment we cross the threshold. This means we'll under-count - // the cost, but only when undercounting doesn't matter. - return Cost >= Threshold && !ComputeFullInlineCost; - } - - virtual void onLoadEliminationOpportunity() { - LoadEliminationCost += InlineConstants::InstrCost; - } - - InlineResult onAnalysisStart() override { - // Perform some tweaks to the cost and threshold based on the direct - // callsite information. - - // We want to more aggressively inline vector-dense kernels, so up the - // threshold, and we'll lower it if the % of vector instructions gets too - // low. 
Note that these bonuses are some what arbitrary and evolved over - // time by accident as much as because they are principled bonuses. - // - // FIXME: It would be nice to remove all such bonuses. At least it would be - // nice to base the bonus values on something more scientific. - assert(NumInstructions == 0); - assert(NumVectorInstructions == 0); - - // Update the threshold based on callsite properties - updateThreshold(CandidateCall, F); - - // While Threshold depends on commandline options that can take negative - // values, we want to enforce the invariant that the computed threshold and - // bonuses are non-negative. - assert(Threshold >= 0); - assert(SingleBBBonus >= 0); - assert(VectorBonus >= 0); - - // Speculatively apply all possible bonuses to Threshold. If cost exceeds - // this Threshold any time, and cost cannot decrease, we can stop processing - // the rest of the function body. - Threshold += (SingleBBBonus + VectorBonus); - - // Give out bonuses for the callsite, as the instructions setting them up - // will be gone after inlining. - addCost(-getCallsiteCost(this->CandidateCall, DL)); - - // If this function uses the coldcc calling convention, prefer not to inline - // it. - if (F.getCallingConv() == CallingConv::Cold) - Cost += InlineConstants::ColdccPenalty; - - // Check if we're done. This can happen due to bonuses and penalties. - if (Cost >= Threshold && !ComputeFullInlineCost) - return "high cost"; - - return true; - } - -public: - InlineCostCallAnalyzer( - const TargetTransformInfo &TTI, - std::function &GetAssumptionCache, - Optional> &GetBFI, - ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee, - CallBase &Call, const InlineParams &Params, bool BoostIndirect = true) - : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call), - ComputeFullInlineCost(OptComputeFullInlineCost || - Params.ComputeFullInlineCost || ORE), - Params(Params), Threshold(Params.DefaultThreshold), - BoostIndirectCalls(BoostIndirect) {} void dump(); - - int getThreshold() { return Threshold; } - int getCost() { return Cost; } }; + } // namespace /// Test whether the given value is an Alloca-derived function argument. @@ -635,22 +333,55 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) { return SROAArgValues.count(V); } +/// Lookup the SROA-candidate argument and cost iterator which V maps to. +/// Returns false if V does not map to a SROA-candidate. +bool CallAnalyzer::lookupSROAArgAndCost( + Value *V, Value *&Arg, DenseMap::iterator &CostIt) { + if (SROAArgValues.empty() || SROAArgCosts.empty()) + return false; + + DenseMap::iterator ArgIt = SROAArgValues.find(V); + if (ArgIt == SROAArgValues.end()) + return false; + + Arg = ArgIt->second; + CostIt = SROAArgCosts.find(Arg); + return CostIt != SROAArgCosts.end(); +} + +/// Disable SROA for the candidate marked by this cost iterator. +/// +/// This marks the candidate as no longer viable for SROA, and adds the cost +/// savings associated with it back into the inline cost measurement. +void CallAnalyzer::disableSROA(DenseMap::iterator CostIt) { + // If we're no longer able to perform SROA we need to undo its cost savings + // and prevent subsequent analysis. + addCost(CostIt->second); + SROACostSavings -= CostIt->second; + SROACostSavingsLost += CostIt->second; + SROAArgCosts.erase(CostIt); + disableLoadElimination(); +} + /// If 'V' maps to a SROA candidate, disable SROA for it. 
void CallAnalyzer::disableSROA(Value *V) { - auto It = SROAArgValues.find(V); - if (It == SROAArgValues.end()) - return; - auto *SROAArg = It->second; - if (!SROAArg) - return; - It->second = nullptr; - onDisableSROA(SROAArg); - disableLoadElimination(); + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(V, SROAArg, CostIt)) + disableSROA(CostIt); +} + +/// Accumulate the given cost for a particular SROA candidate. +void CallAnalyzer::accumulateSROACost(DenseMap::iterator CostIt, + int InstructionCost) { + CostIt->second += InstructionCost; + SROACostSavings += InstructionCost; } void CallAnalyzer::disableLoadElimination() { if (EnableLoadElimination) { - onDisableLoadElimination(); + addCost(LoadEliminationCost); + LoadEliminationCost = 0; EnableLoadElimination = false; } } @@ -822,7 +553,9 @@ bool CallAnalyzer::visitPHI(PHINode &I) { if (FirstBaseAndOffset.first) { ConstantOffsetPtrs[&I] = FirstBaseAndOffset; - if (auto *SROAArg = getSROAArgForValueOrNull(FirstV)) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(FirstV, SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; } @@ -852,8 +585,10 @@ bool CallAnalyzer::canFoldInboundsGEP(GetElementPtrInst &I) { } bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { + Value *SROAArg; DenseMap::iterator CostIt; - auto *SROAArg = getSROAArgForValueOrNull(I.getPointerOperand()); + bool SROACandidate = + lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt); // Lambda to check whether a GEP's indices are all constant. auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) { @@ -864,7 +599,7 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { }; if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) { - if (SROAArg) + if (SROACandidate) SROAArgValues[&I] = SROAArg; // Constant GEPs are modeled as free. @@ -872,8 +607,8 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) { } // Variable GEPs will require math and will disable SROA. - if (SROAArg) - disableSROA(SROAArg); + if (SROACandidate) + disableSROA(CostIt); return isGEPFree(I); } @@ -913,7 +648,9 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) { ConstantOffsetPtrs[&I] = BaseAndOffset; // Also look for SROA candidates here. - if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; // Bitcasts are always zero cost. @@ -945,7 +682,9 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) { // and so we can just add the integer in here. The only places where SROA is // preserved either cannot fire on an integer, or won't in-and-of themselves // disable SROA (ext) w/o some later use that we would see and disable. - if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0))) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); @@ -969,7 +708,9 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) { } // "Propagate" SROA here in the same manner as we do for ptrtoint above. 
- if (auto *SROAArg = getSROAArgForValueOrNull(Op)) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(Op, SROAArg, CostIt)) SROAArgValues[&I] = SROAArg; return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I); @@ -996,7 +737,7 @@ bool CallAnalyzer::visitCastInst(CastInst &I) { case Instruction::FPToUI: case Instruction::FPToSI: if (TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive) - onCallPenalty(); + addCost(InlineConstants::CallPenalty); break; default: break; @@ -1069,8 +810,8 @@ bool CallAnalyzer::allowSizeGrowth(CallBase &Call) { return true; } -bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, - BlockFrequencyInfo *CallerBFI) { +bool CallAnalyzer::isColdCallSite(CallBase &Call, + BlockFrequencyInfo *CallerBFI) { // If global profile summary is available, then callsite's coldness is // determined based on that. if (PSI && PSI->hasProfileSummary()) @@ -1093,8 +834,8 @@ bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call, } Optional -InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, - BlockFrequencyInfo *CallerBFI) { +CallAnalyzer::getHotCallSiteThreshold(CallBase &Call, + BlockFrequencyInfo *CallerBFI) { // If global profile summary is available, then callsite's hotness is // determined based on that. @@ -1121,7 +862,7 @@ InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call, return None; } -void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { +void CallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) { // If no size growth is allowed for this inlining, set Threshold to 0. if (!allowSizeGrowth(Call)) { Threshold = 0; @@ -1283,7 +1024,19 @@ bool CallAnalyzer::visitCmpInst(CmpInst &I) { : ConstantInt::getFalse(I.getType()); return true; } - return handleSROA(I.getOperand(0), isa(I.getOperand(1))); + // Finally check for SROA candidates in comparisons. + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(I.getOperand(0), SROAArg, CostIt)) { + if (isa(I.getOperand(1))) { + accumulateSROACost(CostIt, InlineConstants::InstrCost); + return true; + } + + disableSROA(CostIt); + } + + return false; } bool CallAnalyzer::visitSub(BinaryOperator &I) { @@ -1347,7 +1100,7 @@ bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) { if (I.getType()->isFloatingPointTy() && TTI.getFPOpCost(I.getType()) == TargetTransformInfo::TCC_Expensive && !match(&I, m_FNeg(m_Value()))) - onCallPenalty(); + addCost(InlineConstants::CallPenalty); return false; } @@ -1374,15 +1127,23 @@ bool CallAnalyzer::visitFNeg(UnaryOperator &I) { } bool CallAnalyzer::visitLoad(LoadInst &I) { - if (handleSROA(I.getPointerOperand(), I.isSimple())) - return true; + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { + if (I.isSimple()) { + accumulateSROACost(CostIt, InlineConstants::InstrCost); + return true; + } + + disableSROA(CostIt); + } // If the data is already loaded from this address and hasn't been clobbered // by any stores or calls, this load is likely to be redundant and can be // eliminated. 
if (EnableLoadElimination && !LoadAddrSet.insert(I.getPointerOperand()).second && I.isUnordered()) { - onLoadEliminationOpportunity(); + LoadEliminationCost += InlineConstants::InstrCost; return true; } @@ -1390,8 +1151,16 @@ bool CallAnalyzer::visitLoad(LoadInst &I) { } bool CallAnalyzer::visitStore(StoreInst &I) { - if (handleSROA(I.getPointerOperand(), I.isSimple())) - return true; + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(I.getPointerOperand(), SROAArg, CostIt)) { + if (I.isSimple()) { + accumulateSROACost(CostIt, InlineConstants::InstrCost); + return true; + } + + disableSROA(CostIt); + } // The store can potentially clobber loads and prevent repeated loads from // being eliminated. @@ -1481,7 +1250,9 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { // in this inline context. If not, we've done all we can. F = dyn_cast_or_null(SimplifiedValues.lookup(Callee)); if (!F) { - onCallArgumentSetup(Call); + // Pay the price of the argument setup. We account for the average 1 + // instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); if (!Call.onlyReadsMemory()) disableLoadElimination(); @@ -1505,7 +1276,8 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { return Base::visitCallBase(Call); case Intrinsic::load_relative: - onLoadRelativeIntrinsic(); + // This is normally lowered to 4 LLVM instructions. + addCost(3 * InlineConstants::InstrCost); return false; case Intrinsic::memset: @@ -1532,7 +1304,28 @@ bool CallAnalyzer::visitCallBase(CallBase &Call) { } if (TTI.isLoweredToCall(F)) { - onLoweredCall(F, Call, IsIndirectCall); + // We account for the average 1 instruction per call argument setup here. + addCost(Call.arg_size() * InlineConstants::InstrCost); + + // If we have a constant that we are calling as a function, we can peer + // through it and see the function target. This happens not infrequently + // during devirtualization and so we want to give it a hefty bonus for + // inlining, but cap that bonus in the event that inlining wouldn't pan out. + // Pretend to inline the function, with a custom threshold. + if (IsIndirectCall && BoostIndirectCalls) { + auto IndirectCallParams = Params; + IndirectCallParams.DefaultThreshold = + InlineConstants::IndirectCallThreshold; + CallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F, Call, + IndirectCallParams, false); + if (CA.analyze()) { + // We were able to inline the indirect call! Subtract the cost from the + // threshold to get the bonus we want to apply, but don't go below zero. + Cost -= std::max(0, CA.getThreshold() - CA.getCost()); + } + } else + // Otherwise simply add the cost for merely making the call. 
+ addCost(InlineConstants::CallPenalty); } if (!(Call.onlyReadsMemory() || (IsIndirectCall && F->onlyReadsMemory()))) @@ -1588,7 +1381,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (TrueBaseAndOffset == FalseBaseAndOffset && TrueBaseAndOffset.first) { ConstantOffsetPtrs[&SI] = TrueBaseAndOffset; - if (auto *SROAArg = getSROAArgForValueOrNull(TrueVal)) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(TrueVal, SROAArg, CostIt)) SROAArgValues[&SI] = SROAArg; return true; } @@ -1627,7 +1422,9 @@ bool CallAnalyzer::visitSelectInst(SelectInst &SI) { if (BaseAndOffset.first) { ConstantOffsetPtrs[&SI] = BaseAndOffset; - if (auto *SROAArg = getSROAArgForValueOrNull(SelectedV)) + Value *SROAArg; + DenseMap::iterator CostIt; + if (lookupSROAArgAndCost(SelectedV, SROAArg, CostIt)) SROAArgValues[&SI] = SROAArg; } @@ -1655,12 +1452,49 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) { // inlining those. It will prevent inlining in cases where the optimization // does not (yet) fire. + // Maximum valid cost increased in this function. + int CostUpperBound = INT_MAX - InlineConstants::InstrCost - 1; + unsigned JumpTableSize = 0; BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr; unsigned NumCaseCluster = TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI); - onFinalizeSwitch(JumpTableSize, NumCaseCluster); + // If suitable for a jump table, consider the cost for the table size and + // branch to destination. + if (JumpTableSize) { + int64_t JTCost = (int64_t)JumpTableSize * InlineConstants::InstrCost + + 4 * InlineConstants::InstrCost; + + addCost(JTCost, (int64_t)CostUpperBound); + return false; + } + + // Considering forming a binary search, we should find the number of nodes + // which is same as the number of comparisons when lowered. For a given + // number of clusters, n, we can define a recursive function, f(n), to find + // the number of nodes in the tree. The recursion is : + // f(n) = 1 + f(n/2) + f (n - n/2), when n > 3, + // and f(n) = n, when n <= 3. + // This will lead a binary tree where the leaf should be either f(2) or f(3) + // when n > 3. So, the number of comparisons from leaves should be n, while + // the number of non-leaf should be : + // 2^(log2(n) - 1) - 1 + // = 2^log2(n) * 2^-1 - 1 + // = n / 2 - 1. + // Considering comparisons from leaf and non-leaf nodes, we can estimate the + // number of comparisons in a simple closed form : + // n + n / 2 - 1 = n * 3 / 2 - 1 + if (NumCaseCluster <= 3) { + // Suppose a comparison includes one compare and one conditional branch. + addCost(NumCaseCluster * 2 * InlineConstants::InstrCost); + return false; + } + + int64_t ExpectedNumberOfCompare = 3 * (int64_t)NumCaseCluster / 2 - 1; + int64_t SwitchCost = ExpectedNumberOfCompare * 2 * InlineConstants::InstrCost; + + addCost(SwitchCost, (int64_t)CostUpperBound); return false; } @@ -1753,7 +1587,7 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, if (Base::visit(&*I)) ++NumInstructionsSimplified; else - onCommonInstructionSimplification(); + addCost(InlineConstants::InstrCost); using namespace ore; // If the visit this instruction detected an uninlinable pattern, abort. @@ -1798,7 +1632,9 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB, return IR; } - if (shouldStop()) + // Check if we've passed the maximum possible threshold so we don't spin in + // huge basic blocks that will never inline. 
+ if (Cost >= Threshold && !ComputeFullInlineCost) return false; } @@ -1892,9 +1728,46 @@ void CallAnalyzer::findDeadBlocks(BasicBlock *CurrBB, BasicBlock *NextBB) { InlineResult CallAnalyzer::analyze() { ++NumCallsAnalyzed; - auto Result = onAnalysisStart(); - if (!Result) - return Result; + // Perform some tweaks to the cost and threshold based on the direct + // callsite information. + + // We want to more aggressively inline vector-dense kernels, so up the + // threshold, and we'll lower it if the % of vector instructions gets too + // low. Note that these bonuses are some what arbitrary and evolved over time + // by accident as much as because they are principled bonuses. + // + // FIXME: It would be nice to remove all such bonuses. At least it would be + // nice to base the bonus values on something more scientific. + assert(NumInstructions == 0); + assert(NumVectorInstructions == 0); + + // Update the threshold based on callsite properties + updateThreshold(CandidateCall, F); + + // While Threshold depends on commandline options that can take negative + // values, we want to enforce the invariant that the computed threshold and + // bonuses are non-negative. + assert(Threshold >= 0); + assert(SingleBBBonus >= 0); + assert(VectorBonus >= 0); + + // Speculatively apply all possible bonuses to Threshold. If cost exceeds + // this Threshold any time, and cost cannot decrease, we can stop processing + // the rest of the function body. + Threshold += (SingleBBBonus + VectorBonus); + + // Give out bonuses for the callsite, as the instructions setting them up + // will be gone after inlining. + addCost(-getCallsiteCost(CandidateCall, DL)); + + // If this function uses the coldcc calling convention, prefer not to inline + // it. + if (F.getCallingConv() == CallingConv::Cold) + Cost += InlineConstants::ColdccPenalty; + + // Check if we're done. This can happen due to bonuses and penalties. + if (Cost >= Threshold && !ComputeFullInlineCost) + return "high cost"; if (F.empty()) return true; @@ -1923,9 +1796,9 @@ InlineResult CallAnalyzer::analyze() { ConstantOffsetPtrs[&*FAI] = std::make_pair(PtrArg, C->getValue()); // We can SROA any pointer arguments derived from alloca instructions. - if (auto *SROAArg = dyn_cast(PtrArg)) { - SROAArgValues[&*FAI] = SROAArg; - onInitializeSROAArg(SROAArg); + if (isa(PtrArg)) { + SROAArgValues[&*FAI] = PtrArg; + SROAArgCosts[PtrArg] = 0; } } } @@ -1951,10 +1824,12 @@ InlineResult CallAnalyzer::analyze() { BBSetVector; BBSetVector BBWorklist; BBWorklist.insert(&F.getEntryBlock()); - + bool SingleBB = true; // Note that we *must not* cache the size, this loop grows the worklist. for (unsigned Idx = 0; Idx != BBWorklist.size(); ++Idx) { - if (shouldStop()) + // Bail out the moment we cross the threshold. This means we'll under-count + // the cost, but only when undercounting doesn't matter. + if (Cost >= Threshold && !ComputeFullInlineCost) break; BasicBlock *BB = BBWorklist[Idx]; @@ -2014,7 +1889,15 @@ InlineResult CallAnalyzer::analyze() { ++TIdx) BBWorklist.insert(TI->getSuccessor(TIdx)); - onBlockAnalyzed(BB); + // If we had any successors at this point, than post-inlining is likely to + // have them as well. Note that we assume any basic blocks which existed + // due to branches or switches which folded above will also fold after + // inlining. + if (SingleBB && TI->getNumSuccessors() > 1) { + // Take off the bonus we applied to the threshold. 
+ Threshold -= SingleBBBonus; + SingleBB = false; + } } bool OnlyOneCallAndLocalLinkage = F.hasLocalLinkage() && F.hasOneUse() && @@ -2025,12 +1908,38 @@ InlineResult CallAnalyzer::analyze() { if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall) return "noduplicate"; - return finalizeAnalysis(); + // Loops generally act a lot like calls in that they act like barriers to + // movement, require a certain amount of setup, etc. So when optimising for + // size, we penalise any call sites that perform loops. We do this after all + // other costs here, so will likely only be dealing with relatively small + // functions (and hence DT and LI will hopefully be cheap). + if (Caller->hasMinSize()) { + DominatorTree DT(F); + LoopInfo LI(DT); + int NumLoops = 0; + for (Loop *L : LI) { + // Ignore loops that will not be executed + if (DeadBlocks.count(L->getHeader())) + continue; + NumLoops++; + } + addCost(NumLoops * InlineConstants::CallPenalty); + } + + // We applied the maximum possible vector bonus at the beginning. Now, + // subtract the excess bonus, if any, from the Threshold before + // comparing against Cost. + if (NumVectorInstructions <= NumInstructions / 10) + Threshold -= VectorBonus; + else if (NumVectorInstructions <= NumInstructions / 2) + Threshold -= VectorBonus / 2; + + return Cost < std::max(1, Threshold); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) /// Dump stats about this call's analysis. -LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() { +LLVM_DUMP_METHOD void CallAnalyzer::dump() { #define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n" DEBUG_PRINT_STAT(NumConstantArgs); DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs); @@ -2164,8 +2073,8 @@ InlineCost llvm::getInlineCost( LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName() << "... (caller:" << Caller->getName() << ")\n"); - InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, - *Callee, Call, Params); + CallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE, *Callee, + Call, Params); InlineResult ShouldInline = CA.analyze(); LLVM_DEBUG(CA.dump()); @@ -2212,16 +2121,15 @@ InlineResult llvm::isInlineViable(Function &F) { switch (Call->getCalledFunction()->getIntrinsicID()) { default: break; - // Disallow inlining of @llvm.icall.branch.funnel because current - // backend can't separate call targets from call arguments. + // Disallow inlining of @llvm.icall.branch.funnel because current + // backend can't separate call targets from call arguments. case llvm::Intrinsic::icall_branch_funnel: return "disallowed inlining of @llvm.icall.branch.funnel"; - // Disallow inlining functions that call @llvm.localescape. Doing this - // correctly would require major changes to the inliner. + // Disallow inlining functions that call @llvm.localescape. Doing this + // correctly would require major changes to the inliner. case llvm::Intrinsic::localescape: return "disallowed inlining of @llvm.localescape"; - // Disallow inlining of functions that initialize VarArgs with - // va_start. + // Disallow inlining of functions that initialize VarArgs with va_start. case llvm::Intrinsic::vastart: return "contains VarArgs initialized with va_start"; } From 26ba160d47220a0bce75b1f491bf6e262edf69fa Mon Sep 17 00:00:00 2001 From: Zheng Chen Date: Wed, 8 Jan 2020 20:54:15 -0500 Subject: [PATCH 15/89] [PowerPC] when folding rlwinm+rlwinm. to andi., we should use first rlwinm input reg. 
%2:gprc = RLWINM %1:gprc, 27, 5, 10 %3:gprc = RLWINM_rec %2:gprc, 8, 5, 10, implicit-def $cr0 ==> %3:gprc = ANDI_rec %1, 0, implicit-def $cr0 we should use %1 instead of %2 as ANDI_rec input. Reviewed By: steven.zhang Differential Revision: https://reviews.llvm.org/D71885 --- llvm/lib/Target/PowerPC/PPCMIPeephole.cpp | 36 +++++++++++++---------- llvm/test/CodeGen/PowerPC/fold-rlwinm.mir | 21 +++++++++++-- 2 files changed, 39 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp index 1b67e1e55bf78..74192cb20cd05 100644 --- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp +++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp @@ -897,6 +897,8 @@ bool PPCMIPeephole::simplifyCode(void) { bool Is64Bit = (MI.getOpcode() == PPC::RLWINM8 || MI.getOpcode() == PPC::RLWINM8_rec); + Simplified = true; + LLVM_DEBUG(dbgs() << "Replace Instr: "); LLVM_DEBUG(MI.dump()); @@ -913,9 +915,14 @@ bool PPCMIPeephole::simplifyCode(void) { MI.RemoveOperand(3); MI.getOperand(2).setImm(0); MI.setDesc(TII->get(Is64Bit ? PPC::ANDI8_rec : PPC::ANDI_rec)); + MI.getOperand(1).setReg(SrcMI->getOperand(1).getReg()); + if (SrcMI->getOperand(1).isKill()) { + MI.getOperand(1).setIsKill(true); + SrcMI->getOperand(1).setIsKill(false); + } else + // About to replace MI.getOperand(1), clear its kill flag. + MI.getOperand(1).setIsKill(false); } - Simplified = true; - NumRotatesCollapsed++; LLVM_DEBUG(dbgs() << "With: "); LLVM_DEBUG(MI.dump()); @@ -925,16 +932,7 @@ bool PPCMIPeephole::simplifyCode(void) { // than NewME. Otherwise we get a 64 bit value after folding, but MI // return a 32 bit value. - // If FoldingReg has only one use and it it not RLWINM_rec and - // RLWINM8_rec, safe to delete its def SrcMI. Otherwise keep it. - if (MRI->hasOneNonDBGUse(FoldingReg) && - (SrcMI->getOpcode() == PPC::RLWINM || - SrcMI->getOpcode() == PPC::RLWINM8)) { - ToErase = SrcMI; - LLVM_DEBUG(dbgs() << "Delete dead instruction: "); - LLVM_DEBUG(SrcMI->dump()); - } - + Simplified = true; LLVM_DEBUG(dbgs() << "Converting Instr: "); LLVM_DEBUG(MI.dump()); @@ -953,12 +951,20 @@ bool PPCMIPeephole::simplifyCode(void) { // About to replace MI.getOperand(1), clear its kill flag. MI.getOperand(1).setIsKill(false); - Simplified = true; - NumRotatesCollapsed++; - LLVM_DEBUG(dbgs() << "To: "); LLVM_DEBUG(MI.dump()); } + if (Simplified) { + // If FoldingReg has no non-debug use and it has no implicit def (it + // is not RLWINMO or RLWINM8o), it's safe to delete its def SrcMI. + // Otherwise keep it. 
+ ++NumRotatesCollapsed; + if (MRI->use_nodbg_empty(FoldingReg) && !SrcMI->hasImplicitDef()) { + ToErase = SrcMI; + LLVM_DEBUG(dbgs() << "Delete dead instruction: "); + LLVM_DEBUG(SrcMI->dump()); + } + } break; } } diff --git a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir index f2e576ed73b63..410f688204c31 100644 --- a/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir +++ b/llvm/test/CodeGen/PowerPC/fold-rlwinm.mir @@ -118,7 +118,7 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 + ; CHECK-NOT: RLWINM %1, %3:gprc = RLWINM %2:gprc, 8, 5, 10 ; CHECK: %3:gprc = LI 0 BLR8 implicit $lr8, implicit $rm @@ -133,9 +133,24 @@ body: | %0:g8rc = COPY $x3 %1:gprc = COPY %0.sub_32:g8rc %2:gprc = RLWINM %1:gprc, 27, 5, 10 - ; CHECK: %2:gprc = RLWINM %1, 27, 5, 10 + ; CHECK-NOT: RLWINM %1, %3:gprc = RLWINM_rec %2:gprc, 8, 5, 10, implicit-def $cr0 - ; CHECK: %3:gprc = ANDI_rec %2, 0, implicit-def $cr0 + ; CHECK: %3:gprc = ANDI_rec %1, 0, implicit-def $cr0 + BLR8 implicit $lr8, implicit $rm +... +--- +name: testFoldRLWINMoToZeroSrcCanNotBeDeleted +#CHECK : name : testFoldRLWINMoToZeroSrcCanNotBeDeleted +tracksRegLiveness: true +body: | + bb.0.entry: + liveins: $x3 + %0:g8rc = COPY $x3 + %1:gprc = COPY %0.sub_32:g8rc + %2:gprc = RLWINM_rec %1:gprc, 27, 5, 10, implicit-def $cr0 + ; CHECK: %2:gprc = RLWINM_rec %1, 27, 5, 10, implicit-def $cr0 + %3:gprc = RLWINM_rec %2:gprc, 8, 5, 10, implicit-def $cr0 + ; CHECK: %3:gprc = ANDI_rec %1, 0, implicit-def $cr0 BLR8 implicit $lr8, implicit $rm ... --- From a63af915288ad9d1049d486833fcd085b620dc6d Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Wed, 8 Jan 2020 15:35:21 -0800 Subject: [PATCH 16/89] [lldb] Remove various dead Compare functions --- lldb/source/Core/Section.cpp | 23 ----- .../SymbolFile/DWARF/DWARFFormValue.cpp | 95 ------------------- lldb/source/Symbol/Function.cpp | 10 -- lldb/source/Symbol/Type.cpp | 11 --- 4 files changed, 139 deletions(-) diff --git a/lldb/source/Core/Section.cpp b/lldb/source/Core/Section.cpp index 4a9acab2e27c1..b1d7eee108b7d 100644 --- a/lldb/source/Core/Section.cpp +++ b/lldb/source/Core/Section.cpp @@ -280,29 +280,6 @@ bool Section::ContainsFileAddress(addr_t vm_addr) const { return false; } -int Section::Compare(const Section &a, const Section &b) { - if (&a == &b) - return 0; - - const ModuleSP a_module_sp = a.GetModule(); - const ModuleSP b_module_sp = b.GetModule(); - if (a_module_sp == b_module_sp) { - user_id_t a_sect_uid = a.GetID(); - user_id_t b_sect_uid = b.GetID(); - if (a_sect_uid < b_sect_uid) - return -1; - if (a_sect_uid > b_sect_uid) - return 1; - return 0; - } else { - // The modules are different, just compare the module pointers - if (a_module_sp.get() < b_module_sp.get()) - return -1; - else - return 1; // We already know the modules aren't equal - } -} - void Section::Dump(Stream *s, Target *target, uint32_t depth) const { // s->Printf("%.*p: ", (int)sizeof(void*) * 2, this); s->Indent(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp index 5cab4cef143e0..f660cc32b3f8d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp @@ -602,101 +602,6 @@ bool DWARFFormValue::IsDataForm(const dw_form_t form) { return false; } -int DWARFFormValue::Compare(const DWARFFormValue &a_value, - const DWARFFormValue &b_value) { - dw_form_t a_form = 
a_value.Form(); - dw_form_t b_form = b_value.Form(); - if (a_form < b_form) - return -1; - if (a_form > b_form) - return 1; - switch (a_form) { - case DW_FORM_addr: - case DW_FORM_addrx: - case DW_FORM_flag: - case DW_FORM_data1: - case DW_FORM_data2: - case DW_FORM_data4: - case DW_FORM_data8: - case DW_FORM_udata: - case DW_FORM_ref_addr: - case DW_FORM_sec_offset: - case DW_FORM_flag_present: - case DW_FORM_ref_sig8: - case DW_FORM_GNU_addr_index: { - uint64_t a = a_value.Unsigned(); - uint64_t b = b_value.Unsigned(); - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_sdata: { - int64_t a = a_value.Signed(); - int64_t b = b_value.Signed(); - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_string: - case DW_FORM_strp: - case DW_FORM_GNU_str_index: { - const char *a_string = a_value.AsCString(); - const char *b_string = b_value.AsCString(); - if (a_string == b_string) - return 0; - else if (a_string && b_string) - return strcmp(a_string, b_string); - else if (a_string == nullptr) - return -1; // A string is NULL, and B is valid - else - return 1; // A string valid, and B is NULL - } - - case DW_FORM_block: - case DW_FORM_block1: - case DW_FORM_block2: - case DW_FORM_block4: - case DW_FORM_exprloc: { - uint64_t a_len = a_value.Unsigned(); - uint64_t b_len = b_value.Unsigned(); - if (a_len < b_len) - return -1; - if (a_len > b_len) - return 1; - // The block lengths are the same - return memcmp(a_value.BlockData(), b_value.BlockData(), a_value.Unsigned()); - } break; - - case DW_FORM_ref1: - case DW_FORM_ref2: - case DW_FORM_ref4: - case DW_FORM_ref8: - case DW_FORM_ref_udata: { - uint64_t a = a_value.m_value.value.uval; - uint64_t b = b_value.m_value.value.uval; - if (a < b) - return -1; - if (a > b) - return 1; - return 0; - } - - case DW_FORM_indirect: - llvm_unreachable( - "This shouldn't happen after the form has been extracted..."); - - default: - llvm_unreachable("Unhandled DW_FORM"); - } - return -1; -} - bool DWARFFormValue::FormIsSupported(dw_form_t form) { switch (form) { case DW_FORM_addr: diff --git a/lldb/source/Symbol/Function.cpp b/lldb/source/Symbol/Function.cpp index c392317df0066..e92585ccfed70 100644 --- a/lldb/source/Symbol/Function.cpp +++ b/lldb/source/Symbol/Function.cpp @@ -76,16 +76,6 @@ InlineFunctionInfo::InlineFunctionInfo(ConstString name, InlineFunctionInfo::~InlineFunctionInfo() {} -int InlineFunctionInfo::Compare(const InlineFunctionInfo &a, - const InlineFunctionInfo &b) { - - int result = FunctionInfo::Compare(a, b); - if (result) - return result; - // only compare the mangled names if both have them - return Mangled::Compare(a.m_mangled, a.m_mangled); -} - void InlineFunctionInfo::Dump(Stream *s, bool show_fullpaths) const { FunctionInfo::Dump(s, show_fullpaths); if (m_mangled) diff --git a/lldb/source/Symbol/Type.cpp b/lldb/source/Symbol/Type.cpp index aeca76f7d05f8..f194356a0a079 100644 --- a/lldb/source/Symbol/Type.cpp +++ b/lldb/source/Symbol/Type.cpp @@ -656,17 +656,6 @@ CompilerType Type::GetForwardCompilerType() { return m_compiler_type; } -int Type::Compare(const Type &a, const Type &b) { - // Just compare the UID values for now... 
- lldb::user_id_t a_uid = a.GetID(); - lldb::user_id_t b_uid = b.GetID(); - if (a_uid < b_uid) - return -1; - if (a_uid > b_uid) - return 1; - return 0; -} - ConstString Type::GetQualifiedName() { return GetForwardCompilerType().GetConstTypeName(); } From be841f89d0014b1e0246a4feae941b2f74abd908 Mon Sep 17 00:00:00 2001 From: Peng Guo Date: Wed, 8 Jan 2020 17:13:24 -0800 Subject: [PATCH 17/89] [MIR] Target specific MIR formating and parsing Summary: Added MIRFormatter for target specific MIR formating and parsing with immediate and custom pseudo source values. Target machine can subclass MIRFormatter and implement custom logic for printing and parsing immediate and custom pseudo source values for better readability. * Target specific immediate mnemonic need to start with "." follows by identifier string. When MIR parser sees immediate it will call target specific parsing function. * Custom pseudo source value need to start with custom follows by double-quoted string. MIR parser will pass the quoted string to target specific PSV parsing function. * MIRFormatter have 2 helper functions to facilitate LLVM value printing and parsing for custom PSV if they refers LLVM values. Reviewers: dsanders, arsenm Reviewed By: dsanders Subscribers: wdng, jvesely, nhaehnle, hiraditya, jfb, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D69836 --- llvm/include/llvm/CodeGen/MIRFormatter.h | 83 ++++++ .../include/llvm/CodeGen/MIRParser/MIParser.h | 4 + llvm/include/llvm/CodeGen/MachineMemOperand.h | 4 +- llvm/include/llvm/CodeGen/MachineOperand.h | 8 +- llvm/include/llvm/CodeGen/PseudoSourceValue.h | 2 + llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 + llvm/include/llvm/Target/TargetMachine.h | 6 + llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1 + llvm/lib/CodeGen/MIRParser/MILexer.h | 1 + llvm/lib/CodeGen/MIRParser/MIParser.cpp | 265 +++++++++++++----- llvm/lib/CodeGen/MIRPrinter.cpp | 27 +- llvm/lib/CodeGen/MachineInstr.cpp | 19 +- llvm/lib/CodeGen/MachineOperand.cpp | 57 ++-- .../SelectionDAG/SelectionDAGDumper.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 5 +- ...llvm.amdgcn.raw.buffer.store.format.f16.ll | 44 +-- ...llvm.amdgcn.raw.buffer.store.format.f32.ll | 24 +- .../llvm.amdgcn.raw.buffer.store.ll | 66 ++--- .../regbankselect-amdgcn.image.load.1d.ll | 8 +- .../regbankselect-amdgcn.image.sample.1d.ll | 10 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 10 +- ...regbankselect-amdgcn.struct.buffer.load.ll | 10 +- ...egbankselect-amdgcn.struct.buffer.store.ll | 10 +- .../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 200 ++++++------- .../AMDGPU/extract_subvector_vec4_vec3.ll | 4 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- llvm/unittests/CodeGen/MachineOperandTest.cpp | 2 +- 27 files changed, 561 insertions(+), 315 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/MIRFormatter.h diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h new file mode 100644 index 0000000000000..e57c32c5ae614 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MIRFormatter.h @@ -0,0 +1,83 @@ +//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MIRFormatter class. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MIRFORMATTER_H +#define LLVM_CODEGEN_MIRFORMATTER_H + +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/PseudoSourceValue.h" + +namespace llvm { + +struct PerFunctionMIParsingState; +struct SlotMapping; + +/// MIRFormater - Interface to format MIR operand based on target +class MIRFormatter { +public: + typedef function_ref + ErrorCallbackType; + + MIRFormatter() {} + virtual ~MIRFormatter() = default; + + /// Implement target specific printing for machine operand immediate value, so + /// that we can have more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. + virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + Optional OpIdx, int64_t Imm) const { + OS << Imm; + } + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is + /// dot seperated strings. + virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const { + llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); + } + + /// Implement target specific printing of target custom pseudo source value. + /// Default implementation is not necessarily the correct MIR serialization + /// format. + virtual void + printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, + const PseudoSourceValue &PSV) const { + PSV.printCustom(OS); + } + + /// Implement target specific parsing of target custom pseudo source value. + virtual bool parseCustomPseudoSourceValue( + StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, + const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { + llvm_unreachable( + "target did not implement parsing MIR custom pseudo source value"); + } + + /// Helper functions to print IR value as MIR serialization format which will + /// be useful for target specific printer, e.g. for printing IR value in + /// custom pseudo source value. + static void printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST); + + /// Helper functions to parse IR value from MIR serialization format which + /// will be useful for target specific parser, e.g. for parsing IR value for + /// custom pseudo source value. + static bool parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback); +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 4e32a04551c1c..8ca665b23b280 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,12 +171,16 @@ struct PerFunctionMIParsingState { DenseMap ConstantPoolSlots; DenseMap JumpTableSlots; + /// Maps from slot numbers to function's unnamed values. 
+ DenseMap Slots2Values; + PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); + const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index 7ee700c62a25c..b0243646b06c7 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,6 +26,7 @@ namespace llvm { class FoldingSetNodeID; class MDNode; +class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -295,7 +296,8 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII, + const MIRFormatter *MIRF) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index df914dc2d85e8..4222c03b023a9 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,6 +278,9 @@ class MachineOperand { /// More complex way of printing a MachineOperand. /// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. + /// \param OpIdx - specifies the index of the operand in machine instruction. + /// This will be used by target dependent MIR formatter. Could be None if the + /// index is unknown, e.g. called by dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -294,8 +297,9 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, - unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, + Optional OpIdx, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, + const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 4b3cc9145a134..593a865ea5458 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,6 +22,7 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; +class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -52,6 +53,7 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). + friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<. 
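To make the new hooks concrete, here is a minimal sketch of how a target might subclass MIRFormatter. It is illustrative only and is not part of this patch: the class name, the DoorbellOpcode constant, and the ".doorbell.none" mnemonic are invented for the example, while the virtual method signatures follow the MIRFormatter interface declared above.

  #include "llvm/ADT/Optional.h"
  #include "llvm/CodeGen/MIRFormatter.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Stand-in for a TableGen'd target opcode; a real target would use its own
  // generated enum value here.
  static constexpr unsigned DoorbellOpcode = 1234;

  class XYZMIRFormatter : public MIRFormatter {
  public:
    // Print a readable '.'-prefixed mnemonic instead of a raw 64-bit value for
    // one hypothetical opcode; everything else falls back to the default.
    void printImm(raw_ostream &OS, const MachineInstr &MI,
                  Optional<unsigned> OpIdx, int64_t Imm) const override {
      if (MI.getOpcode() == DoorbellOpcode && Imm == 0) {
        OS << ".doorbell.none";
        return;
      }
      MIRFormatter::printImm(OS, MI, OpIdx, Imm);
    }

    // Turn the mnemonic back into the immediate when the MIR parser hands us
    // the '.'-prefixed token (Src includes the leading dot).
    bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
                          StringRef Src, int64_t &Imm,
                          ErrorCallbackType ErrorCallback) const override {
      if (OpCode == DoorbellOpcode && Src == ".doorbell.none") {
        Imm = 0;
        return false; // false means success, matching the parser convention
      }
      return ErrorCallback(Src.begin(), "unknown immediate mnemonic");
    }
  };

A target would return an instance of such a class from its TargetMachine::getMIRFormatter() override; with that in place the immediate round-trips through MIR as ".doorbell.none" instead of a bare 0, and a target-defined pseudo source value can likewise be serialized as custom "..." in memory operands, which is the syntax the MIParser changes below accept.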
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 4b4cea30b2ba3..e410d1c4806d3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1807,6 +1808,7 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: + mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 176ae39b17a7c..39422ac3bf8ca 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,6 +33,7 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; +class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -94,6 +95,7 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; + std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -197,6 +199,10 @@ class TargetMachine { return nullptr; } + /// Return MIR formatter to format/parse MIR operands. Target can override + /// this virtual function and return target specific MIR formatter. + virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } + bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 21511586ff185..0c35a91f8282e 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,6 +242,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) + .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 1e2eba91ceb53..af5327cacfea5 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,6 +110,7 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, + kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 525c70016a0fb..0f2648e2bfac5 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,6 +28,7 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -343,6 +344,37 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } +static void mapValueToSlot(const Value *V, 
ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. +static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -370,8 +402,6 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; - /// Maps from slot numbers to function's unnamed values. - DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -455,9 +485,12 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest, + bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -473,6 +506,9 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); + bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, const MIRFormatter &MF); + private: /// Convert the integer literal in the current token into an unsigned integer. /// @@ -551,6 +587,9 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } +typedef function_ref + ErrorCallbackType; + static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -912,7 +951,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1493,17 +1532,61 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { +bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, + const MIRFormatter &MF) { + assert(Token.is(MIToken::dot)); + auto Loc = Token.location(); // record start position + size_t Len = 1; // for "." + lex(); + + // Handle the case that mnemonic starts with number. 
+ if (Token.is(MIToken::IntegerLiteral)) { + Len += Token.range().size(); + lex(); + } + + StringRef Src; + if (Token.is(MIToken::comma)) + Src = StringRef(Loc, Len); + else { + assert(Token.is(MIToken::Identifier)); + Src = StringRef(Loc, Len + Token.stringValue().size()); + } + int64_t Val; + if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, + [this](StringRef::iterator Loc, const Twine &Msg) + -> bool { return error(Loc, Msg); })) + return true; + + Dest = MachineOperand::CreateImm(Val); + if (!Token.is(MIToken::comma)) + lex(); + return false; +} + +static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + PerFunctionMIParsingState &PFS, const Constant *&C, + ErrorCallbackType ErrCB) { auto Source = StringValue.str(); // The source has to be null terminated. SMDiagnostic Err; - C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return error(Loc + Err.getColumnNo(), Err.getMessage()); + return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + return ::parseIRConstant( + Loc, StringValue, PFS, C, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1636,27 +1719,52 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::getUnsigned(unsigned &Result) { +static bool getHexUint(const MIToken &Token, APInt &Result) { + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure it isn't the case before constructing result. + unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; +} + +bool getUnsigned(const MIToken &Token, unsigned &Result, + ErrorCallbackType ErrCB) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(A)) + if (getHexUint(Token, A)) return true; if (A.getBitWidth() > 32) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } +bool MIParser::getUnsigned(unsigned &Result) { + return ::getUnsigned( + Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1736,23 +1844,25 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { +static bool parseGlobalValue(const MIToken &Token, + PerFunctionMIParsingState &PFS, GlobalValue *&GV, + ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = MF.getFunction().getParent(); + const Module *M = PFS.MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return error(Twine("use of undefined global value '") + Token.range() + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '") + + Token.range() + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(GVIdx)) + if (getUnsigned(Token, GVIdx, ErrCB)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return error(Twine("use of undefined global value '@") + Twine(GVIdx) + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '@") + + Twine(GVIdx) + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1762,6 +1872,14 @@ bool MIParser::parseGlobalValue(GlobalValue *&GV) { return false; } +bool MIParser::parseGlobalValue(GlobalValue *&GV) { + return ::parseGlobalValue( + Token, PFS, GV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2410,7 +2528,8 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest, +bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2499,6 +2618,12 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); + case MIToken::dot: { + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); + } + LLVM_FALLTHROUGH; + } default: // FIXME: Parse the MCSymbol machine operand. 
return error("expected a machine operand"); @@ -2507,7 +2632,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, } bool MIParser::parseMachineOperandAndTargetFlags( - MachineOperand &Dest, Optional &TiedDefIdx) { + const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, + Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2539,7 +2665,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(Dest, TiedDefIdx)) + if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2600,30 +2726,31 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -bool MIParser::parseIRValue(const Value *&V) { +static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, + const Value *&V, ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(SlotNumber)) + if (getUnsigned(Token, SlotNumber, ErrCB)) return true; - V = getIRValue(SlotNumber); + V = PFS.getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(GV)) + if (parseGlobalValue(Token, PFS, GV, ErrCB)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), C)) + if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) return true; V = C; break; @@ -2632,10 +2759,17 @@ bool MIParser::parseIRValue(const Value *&V) { llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return error(Twine("use of undefined IR value '") + Token.range() + "'"); + return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); return false; } +bool MIParser::parseIRValue(const Value *&V) { + return ::parseIRValue( + Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2656,20 +2790,7 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; + return ::getHexUint(Token, Result); } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2756,6 +2877,19 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; + case MIToken::kw_custom: { + lex(); + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + if (Formatter->parseCustomPseudoSourceValue( + Token.stringValue(), MF, PFS, PSV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + })) + return true; + } else + return error("unable to parse target custom pseudo source value"); + break; + } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2767,7 +2901,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry)) { + Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3018,35 +3152,8 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } -static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. 
-static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - const Value *MIParser::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; + return PFS.getIRValue(Slot); } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3111,3 +3218,15 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } + +bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback) { + MIToken Token; + Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { + ErrorCallback(Loc, Msg); + }); + V = nullptr; + + return ::parseIRValue(Token, PFS, V, ErrorCallback); +} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b06e34a809fca..9d9c12a95918c 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,6 +709,7 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); + const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -807,7 +808,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII); + Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); NeedComma = true; } } @@ -856,7 +857,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -874,6 +875,28 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } +void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 16ae732169a48..177fef80e2e68 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,13 +89,15 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII) { + const TargetInstrInfo *&TII, + const MIRFormatter *&MIRF) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); + MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1477,7 +1479,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); + const MIRFormatter *MIRF = nullptr; + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1506,7 +1509,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1561,7 +1564,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1600,7 +1603,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1611,7 +1614,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
        unsigned TiedOperandIdx = getTiedOperandIdx(i);
-        MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
+        MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone,
                  ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
       }
     } else if (i == AsmDescOp && MO.isImm()) {
@@ -1678,7 +1681,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
       if (MO.isImm() && isOperandSubregIdx(i))
         MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI);
       else
-        MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone,
+        MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone,
                  ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo);
     }
   }
@@ -1737,7 +1740,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
     for (const MachineMemOperand *Op : memoperands()) {
       if (NeedComma)
         OS << ", ";
-      Op->print(OS, MST, SSNs, *Context, MFI, TII);
+      Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF);
       NeedComma = true;
     }
   }
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 8b19501ec3cf1..5dd98467ba663 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/CodeGen/MIRPrinter.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
@@ -458,28 +459,6 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB,
     OS << "";
 }
 
-static void printIRValueReference(raw_ostream &OS, const Value &V,
-                                  ModuleSlotTracker &MST) {
-  if (isa<GlobalValue>(V)) {
-    V.printAsOperand(OS, /*PrintType=*/false, MST);
-    return;
-  }
-  if (isa<Constant>(V)) {
-    // Machine memory operands can load/store to/from constant value pointers.
-    OS << '`';
-    V.printAsOperand(OS, /*PrintType=*/true, MST);
-    OS << '`';
-    return;
-  }
-  OS << "%ir.";
-  if (V.hasName()) {
-    printLLVMNameWithoutPrefix(OS, V.getName());
-    return;
-  }
-  int Slot = MST.getCurrentFunction() ? MST.getLocalSlot(&V) : -1;
-  MachineOperand::printIRSlotNumber(OS, Slot);
-}
-
 static void printSyncScope(raw_ostream &OS, const LLVMContext &Context,
                            SyncScope::ID SSID,
                            SmallVectorImpl<StringRef> &SSNs) {
@@ -734,14 +713,15 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint,
                            const TargetIntrinsicInfo *IntrinsicInfo) const {
   tryToGetTargetInfo(*this, TRI, IntrinsicInfo);
   ModuleSlotTracker DummyMST(nullptr);
-  print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true,
+  print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false,
+        /*IsStandalone=*/true,
         /*ShouldPrintRegisterTies=*/true,
         /*TiedOperandIdx=*/0, TRI, IntrinsicInfo);
 }
 
 void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
-                           LLT TypeToPrint, bool PrintDef, bool IsStandalone,
-                           bool ShouldPrintRegisterTies,
+                           LLT TypeToPrint, Optional<unsigned> OpIdx, bool PrintDef,
+                           bool IsStandalone, bool ShouldPrintRegisterTies,
                            unsigned TiedOperandIdx,
                            const TargetRegisterInfo *TRI,
                            const TargetIntrinsicInfo *IntrinsicInfo) const {
@@ -802,9 +782,16 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
     OS << '(' << TypeToPrint << ')';
     break;
   }
-  case MachineOperand::MO_Immediate:
-    OS << getImm();
+  case MachineOperand::MO_Immediate: {
+    const MIRFormatter *Formatter = nullptr;
+    if (const MachineFunction *MF = getMFIfAvailable(*this))
+      Formatter = MF->getTarget().getMIRFormatter();
+    if (Formatter)
+      Formatter->printImm(OS, *getParent(), OpIdx, getImm());
+    else
+      OS << getImm();
     break;
+  }
   case MachineOperand::MO_CImmediate:
     getCImm()->printAsOperand(OS, /*PrintType=*/true, MST);
     break;
@@ -1070,7 +1057,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
                               SmallVectorImpl<StringRef> &SSNs,
                               const LLVMContext &Context,
                               const MachineFrameInfo *MFI,
-                              const TargetInstrInfo *TII) const {
+                              const TargetInstrInfo *TII,
+                              const MIRFormatter* MIRF) const {
   OS << '(';
   if (isVolatile())
     OS << "volatile ";
@@ -1111,7 +1099,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
   if (const Value *Val = getValue()) {
     OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
-    printIRValueReference(OS, *Val, MST);
+    MIRFormatter::printIRValue(OS, *Val, MST);
   } else if (const PseudoSourceValue *PVal = getPseudoValue()) {
     OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into ");
     assert(PVal && "Expected a pseudo source value");
@@ -1144,15 +1132,20 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
       printLLVMNameWithoutPrefix(
           OS, cast<ExternalSymbolPseudoSourceValue>(PVal)->getSymbol());
       break;
-    default:
+    default: {
       // FIXME: This is not necessarily the correct MIR serialization format for
       // a custom pseudo source value, but at least it allows
       // -print-machineinstrs to work on a target with custom pseudo source
      // values.
-      OS << "custom ";
-      PVal->printCustom(OS);
+      OS << "custom \"";
+      if (MIRF)
+        MIRF->printCustomPseudoSourceValue(OS, MST, *PVal);
+      else
+        PVal->printCustom(OS);
+      OS << '\"';
       break;
     }
+    }
   }
   MachineOperand::printOperandOffset(OS, getOffset());
   if (getBaseAlignment() != getSize())
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 6fd71393bf38c..a12d708c2dd4d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -480,7 +480,7 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
   if (MF)
     MST.incorporateFunction(MF->getFunction());
   SmallVector SSNs;
-  MMO.print(OS, MST, SSNs, Ctx, MFI, TII);
+  MMO.print(OS, MST, SSNs, Ctx, MFI, TII, MF->getTarget().getMIRFormatter());
 }
 
 static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO,
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 97a1eb2f190a9..f070b143d5b4e 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -12,6 +12,7 @@
 #include "llvm/Target/TargetMachine.h"
 
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MIRFormatter.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalValue.h"
@@ -37,7 +38,9 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
     : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
       TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
       RequireStructuredCFG(false), O0WantsFastISel(false),
-      DefaultOptions(Options), Options(Options) {}
+      DefaultOptions(Options), Options(Options) {
+  MIRF = std::make_unique<MIRFormatter>();
+}
 
 TargetMachine::~TargetMachine() = default;
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
index dd4f892ebc231..ce71a89adacb7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll
@@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4)
+  ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4)
   ; UNPACKED: S_ENDPGM 0
   ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16
   ; PACKED: bb.1 (%ir-block.0):
@@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse
   ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
   ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6
   ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
-  ; PACKED: 
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; 
PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact 
[[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, 
align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 75d25b0c2c469..aea37fd08b408 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom 
"TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 4db5fe081fda2..c5aa36df8675d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; 
CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, 
align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + 
; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 44a17012237ec..d4a3f4025b378 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 
x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index c59372a8d09c7..e5d67a3f88742 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: 
(store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 33a8e9a1284cc..4443daba2ee2d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), 
[[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], 
[[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index 9bc81aecc8a1d..a657488278b04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index efe81eabc3497..a6ba559382f5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into 
custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index f96a13878ba6a..ce62e041aa67e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: 
(dereferenceable invariant load 16 from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 
into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom 
"TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], 
[[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from 
custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, 
addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) - ; GCN: 
BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 
304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", 
align 1, addrspace 4) ; GCN: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 0c264251942a4..93322c7da4f86 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index cdcf7383afc09..068e3d98f17be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index faa471f2260c7..7e60fab281545 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
   raw_string_ostream OS(str);
-  MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false,
+  MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false,
            /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr,
            /*IntrinsicInfo=*/nullptr);
   ASSERT_TRUE(OS.str() == "!0");

From d44a0743175fc781aa2a23d638d683ded6eb10b8 Mon Sep 17 00:00:00 2001
From: Jason Molenda
Date: Wed, 8 Jan 2020 18:44:50 -0800
Subject: [PATCH 18/89] Save more descriptive error msg from FBS/BKS, relay it up to lldb.

When lldb requests an app launch through FrontBoard/BackBoard and there is
a problem, we get back an NSError object with an integer error code and a
descriptive text string. debugserver would log the descriptive text string
to the console, but it would only save the error code value, ask for the
much-less-specific name of that error code, and send that very generic
error word back to lldb.

This patch saves the longer description of the failure when available, and
sends that to lldb. If unavailable, it falls back to sending up the generic
description of the error code as it was doing before.

This only impacts the iOS on-device debugserver.
---
 .../debugserver/source/MacOSX/MachProcess.mm | 46 +++++++++++--------
 1 file changed, 26 insertions(+), 20 deletions(-)

diff --git a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
index 2e952d6ad0bc7..40facdfb5cf9b 100644
--- a/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
+++ b/lldb/tools/debugserver/source/MacOSX/MachProcess.mm
@@ -86,7 +86,7 @@ static CFStringRef CopyBundleIDForPath(const char *app_bundle_path,
 #if defined(WITH_BKS) || defined(WITH_FBS)
 #import
 static const int OPEN_APPLICATION_TIMEOUT_ERROR = 111;
-typedef void (*SetErrorFunction)(NSInteger, DNBError &);
+typedef void (*SetErrorFunction)(NSInteger, std::string, DNBError &);
 typedef bool (*CallOpenApplicationFunction)(NSString *bundleIDNSStr,
                                             NSDictionary *options,
                                             DNBError &error, pid_t *return_pid);
@@ -122,6 +122,7 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr,
   mach_port_t client_port = [system_service createClientPort];
   __block dispatch_semaphore_t semaphore = dispatch_semaphore_create(0);
   __block ErrorFlavor open_app_error = no_error_enum_value;
+  __block std::string open_app_error_string;
   bool wants_pid = (return_pid != NULL);
   __block pid_t pid_in_block;
@@ -159,6 +160,9 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr,
     } else {
       const char *error_str =
          [(NSString *)[bks_error localizedDescription] UTF8String];
+      if (error_str) {
+        open_app_error_string = error_str;
+      }
       DNBLogThreadedIf(LOG_PROCESS, "In completion handler for send "
                                     "event, got error \"%s\"(%ld).",
                        error_str ?
error_str : "", @@ -190,7 +194,7 @@ static bool CallBoardSystemServiceOpenApplication(NSString *bundleIDNSStr, error.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); error.SetErrorString("timed out trying to launch app"); } else if (open_app_error != no_error_enum_value) { - error_function(open_app_error, error); + error_function(open_app_error, open_app_error_string, error); DNBLogError("unable to launch the application with CFBundleIdentifier '%s' " "bks_error = %u", cstr, open_app_error); @@ -245,19 +249,19 @@ static bool IsBKSProcess(nub_process_t pid) { return app_state != BKSApplicationStateUnknown; } -static void SetBKSError(NSInteger error_code, DNBError &error) { +static void SetBKSError(NSInteger error_code, + std::string error_description, + DNBError &error) { error.SetError(error_code, DNBError::BackBoard); NSString *err_nsstr = ::BKSOpenApplicationErrorCodeToString( (BKSOpenApplicationErrorCode)error_code); - const char *err_str = NULL; - if (err_nsstr == NULL) - err_str = "unknown BKS error"; - else { + std::string err_str = "unknown BKS error"; + if (error_description.empty() == false) { + err_str = error_description; + } else if (err_nsstr != nullptr) { err_str = [err_nsstr UTF8String]; - if (err_str == NULL) - err_str = "unknown BKS error"; } - error.SetErrorString(err_str); + error.SetErrorString(err_str.c_str()); } static bool BKSAddEventDataToOptions(NSMutableDictionary *options, @@ -355,19 +359,19 @@ static bool IsFBSProcess(nub_process_t pid) { } #endif -static void SetFBSError(NSInteger error_code, DNBError &error) { +static void SetFBSError(NSInteger error_code, + std::string error_description, + DNBError &error) { error.SetError((DNBError::ValueType)error_code, DNBError::FrontBoard); NSString *err_nsstr = ::FBSOpenApplicationErrorCodeToString( (FBSOpenApplicationErrorCode)error_code); - const char *err_str = NULL; - if (err_nsstr == NULL) - err_str = "unknown FBS error"; - else { + std::string err_str = "unknown FBS error"; + if (error_description.empty() == false) { + err_str = error_description; + } else if (err_nsstr != nullptr) { err_str = [err_nsstr UTF8String]; - if (err_str == NULL) - err_str = "unknown FBS error"; } - error.SetErrorString(err_str); + error.SetErrorString(err_str.c_str()); } static bool FBSAddEventDataToOptions(NSMutableDictionary *options, @@ -2754,7 +2758,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, "debugserver timed out waiting for openApplication to complete."); attach_err.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); } else if (attach_error_code != FBSOpenApplicationErrorCodeNone) { - SetFBSError(attach_error_code, attach_err); + std::string empty_str; + SetFBSError(attach_error_code, empty_str, attach_err); DNBLogError("unable to launch the application with CFBundleIdentifier " "'%s' bks_error = %ld", bundleIDStr.c_str(), (NSInteger)attach_error_code); @@ -2831,7 +2836,8 @@ static bool FBSAddEventDataToOptions(NSMutableDictionary *options, "debugserver timed out waiting for openApplication to complete."); attach_err.SetError(OPEN_APPLICATION_TIMEOUT_ERROR, DNBError::Generic); } else if (attach_error_code != BKSOpenApplicationErrorCodeNone) { - SetBKSError(attach_error_code, attach_err); + std::string empty_str; + SetBKSError(attach_error_code, empty_str, attach_err); DNBLogError("unable to launch the application with CFBundleIdentifier " "'%s' bks_error = %ld", bundleIDStr.c_str(), attach_error_code); From 5ab6fa7b7011b00e8955168c5bbcb46e9712afa9 Mon Sep 17 00:00:00 2001 From: 
Daniel Sanders Date: Wed, 8 Jan 2020 18:44:12 -0800 Subject: [PATCH 19/89] Revert "[MIR] Target specific MIR formating and parsing" Forgot to credit Peng in the commit message. This reverts commit be841f89d0014b1e0246a4feae941b2f74abd908. --- llvm/include/llvm/CodeGen/MIRFormatter.h | 83 ------ .../include/llvm/CodeGen/MIRParser/MIParser.h | 4 - llvm/include/llvm/CodeGen/MachineMemOperand.h | 4 +- llvm/include/llvm/CodeGen/MachineOperand.h | 8 +- llvm/include/llvm/CodeGen/PseudoSourceValue.h | 2 - llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 - llvm/include/llvm/Target/TargetMachine.h | 6 - llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1 - llvm/lib/CodeGen/MIRParser/MILexer.h | 1 - llvm/lib/CodeGen/MIRParser/MIParser.cpp | 265 +++++------------- llvm/lib/CodeGen/MIRPrinter.cpp | 27 +- llvm/lib/CodeGen/MachineInstr.cpp | 19 +- llvm/lib/CodeGen/MachineOperand.cpp | 57 ++-- .../SelectionDAG/SelectionDAGDumper.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 5 +- ...llvm.amdgcn.raw.buffer.store.format.f16.ll | 44 +-- ...llvm.amdgcn.raw.buffer.store.format.f32.ll | 24 +- .../llvm.amdgcn.raw.buffer.store.ll | 66 ++--- .../regbankselect-amdgcn.image.load.1d.ll | 8 +- .../regbankselect-amdgcn.image.sample.1d.ll | 10 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 10 +- ...regbankselect-amdgcn.struct.buffer.load.ll | 10 +- ...egbankselect-amdgcn.struct.buffer.store.ll | 10 +- .../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 200 ++++++------- .../AMDGPU/extract_subvector_vec4_vec3.ll | 4 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- llvm/unittests/CodeGen/MachineOperandTest.cpp | 2 +- 27 files changed, 315 insertions(+), 561 deletions(-) delete mode 100644 llvm/include/llvm/CodeGen/MIRFormatter.h diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h deleted file mode 100644 index e57c32c5ae614..0000000000000 --- a/llvm/include/llvm/CodeGen/MIRFormatter.h +++ /dev/null @@ -1,83 +0,0 @@ -//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the MIRFormatter class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_MIRFORMATTER_H -#define LLVM_CODEGEN_MIRFORMATTER_H - -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/PseudoSourceValue.h" - -namespace llvm { - -struct PerFunctionMIParsingState; -struct SlotMapping; - -/// MIRFormater - Interface to format MIR operand based on target -class MIRFormatter { -public: - typedef function_ref - ErrorCallbackType; - - MIRFormatter() {} - virtual ~MIRFormatter() = default; - - /// Implement target specific printing for machine operand immediate value, so - /// that we can have more meaningful mnemonic than a 64-bit integer. Passing - /// None to OpIdx means the index is unknown. - virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - Optional OpIdx, int64_t Imm) const { - OS << Imm; - } - - /// Implement target specific parsing of immediate mnemonics. The mnemonic is - /// dot seperated strings. 
- virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const { - llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); - } - - /// Implement target specific printing of target custom pseudo source value. - /// Default implementation is not necessarily the correct MIR serialization - /// format. - virtual void - printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, - const PseudoSourceValue &PSV) const { - PSV.printCustom(OS); - } - - /// Implement target specific parsing of target custom pseudo source value. - virtual bool parseCustomPseudoSourceValue( - StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, - const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { - llvm_unreachable( - "target did not implement parsing MIR custom pseudo source value"); - } - - /// Helper functions to print IR value as MIR serialization format which will - /// be useful for target specific printer, e.g. for printing IR value in - /// custom pseudo source value. - static void printIRValue(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST); - - /// Helper functions to parse IR value from MIR serialization format which - /// will be useful for target specific parser, e.g. for parsing IR value for - /// custom pseudo source value. - static bool parseIRValue(StringRef Src, MachineFunction &MF, - PerFunctionMIParsingState &PFS, const Value *&V, - ErrorCallbackType ErrorCallback); -}; - -} // end namespace llvm - -#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 8ca665b23b280..4e32a04551c1c 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,16 +171,12 @@ struct PerFunctionMIParsingState { DenseMap ConstantPoolSlots; DenseMap JumpTableSlots; - /// Maps from slot numbers to function's unnamed values. - DenseMap Slots2Values; - PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); - const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index b0243646b06c7..7ee700c62a25c 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,7 +26,6 @@ namespace llvm { class FoldingSetNodeID; class MDNode; -class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -296,8 +295,7 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII, - const MIRFormatter *MIRF) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index 4222c03b023a9..df914dc2d85e8 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,9 +278,6 @@ class MachineOperand { /// More complex way of printing a MachineOperand. 
/// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. - /// \param OpIdx - specifies the index of the operand in machine instruction. - /// This will be used by target dependent MIR formatter. Could be None if the - /// index is unknown, e.g. called by dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -297,9 +294,8 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - Optional OpIdx, bool PrintDef, bool IsStandalone, - bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, - const TargetRegisterInfo *TRI, + bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, + unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 593a865ea5458..4b3cc9145a134 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,7 +22,6 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; -class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -53,7 +52,6 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). - friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<. diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index e410d1c4806d3..4b4cea30b2ba3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,7 +18,6 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1808,7 +1807,6 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: - mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 39422ac3bf8ca..176ae39b17a7c 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,7 +33,6 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; -class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -95,7 +94,6 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; - std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -199,10 +197,6 @@ class TargetMachine { return nullptr; } - /// Return MIR formatter to format/parse MIR operands. Target can override - /// this virtual function and return target specific MIR formatter. 
- virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } - bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 0c35a91f8282e..21511586ff185 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,7 +242,6 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) - .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index af5327cacfea5..1e2eba91ceb53 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,7 +110,6 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, - kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 0f2648e2bfac5..525c70016a0fb 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,7 +28,6 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -344,37 +343,6 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } -static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. -static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - -const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; -} - namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -402,6 +370,8 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; + /// Maps from slot numbers to function's unnamed values. 
+ DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -485,12 +455,9 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, + bool parseMachineOperand(MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(const unsigned OpCode, - const unsigned OpIdx, - MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -506,9 +473,6 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); - bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, const MIRFormatter &MF); - private: /// Convert the integer literal in the current token into an unsigned integer. /// @@ -587,9 +551,6 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } -typedef function_ref - ErrorCallbackType; - static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -951,7 +912,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1532,61 +1493,17 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, - const unsigned OpIdx, - MachineOperand &Dest, - const MIRFormatter &MF) { - assert(Token.is(MIToken::dot)); - auto Loc = Token.location(); // record start position - size_t Len = 1; // for "." - lex(); - - // Handle the case that mnemonic starts with number. - if (Token.is(MIToken::IntegerLiteral)) { - Len += Token.range().size(); - lex(); - } - - StringRef Src; - if (Token.is(MIToken::comma)) - Src = StringRef(Loc, Len); - else { - assert(Token.is(MIToken::Identifier)); - Src = StringRef(Loc, Len + Token.stringValue().size()); - } - int64_t Val; - if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, - [this](StringRef::iterator Loc, const Twine &Msg) - -> bool { return error(Loc, Msg); })) - return true; - - Dest = MachineOperand::CreateImm(Val); - if (!Token.is(MIToken::comma)) - lex(); - return false; -} - -static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - PerFunctionMIParsingState &PFS, const Constant *&C, - ErrorCallbackType ErrCB) { +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { auto Source = StringValue.str(); // The source has to be null terminated. 
SMDiagnostic Err; - C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); + return error(Loc + Err.getColumnNo(), Err.getMessage()); return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { - return ::parseIRConstant( - Loc, StringValue, PFS, C, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1719,52 +1636,27 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -static bool getHexUint(const MIToken &Token, APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; -} - -bool getUnsigned(const MIToken &Token, unsigned &Result, - ErrorCallbackType ErrCB) { +bool MIParser::getUnsigned(unsigned &Result) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return ErrCB(Token.location(), "expected 32-bit integer (too large)"); + return error("expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(Token, A)) + if (getHexUint(A)) return true; if (A.getBitWidth() > 32) - return ErrCB(Token.location(), "expected 32-bit integer (too large)"); + return error("expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } -bool MIParser::getUnsigned(unsigned &Result) { - return ::getUnsigned( - Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1844,25 +1736,23 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -static bool parseGlobalValue(const MIToken &Token, - PerFunctionMIParsingState &PFS, GlobalValue *&GV, - ErrorCallbackType ErrCB) { +bool MIParser::parseGlobalValue(GlobalValue *&GV) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = PFS.MF.getFunction().getParent(); + const Module *M = MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return ErrCB(Token.location(), Twine("use of undefined global value '") + - Token.range() + "'"); + return error(Twine("use of undefined global value '") + Token.range() + + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(Token, GVIdx, ErrCB)) + if (getUnsigned(GVIdx)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return ErrCB(Token.location(), Twine("use of undefined global value '@") + - 
Twine(GVIdx) + "'"); + return error(Twine("use of undefined global value '@") + Twine(GVIdx) + + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1872,14 +1762,6 @@ static bool parseGlobalValue(const MIToken &Token, return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { - return ::parseGlobalValue( - Token, PFS, GV, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2528,8 +2410,7 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, +bool MIParser::parseMachineOperand(MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2618,12 +2499,6 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); - case MIToken::dot: { - if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { - return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); - } - LLVM_FALLTHROUGH; - } default: // FIXME: Parse the MCSymbol machine operand. return error("expected a machine operand"); @@ -2632,8 +2507,7 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, } bool MIParser::parseMachineOperandAndTargetFlags( - const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, - Optional &TiedDefIdx) { + MachineOperand &Dest, Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2665,7 +2539,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) + if (parseMachineOperand(Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2726,31 +2600,30 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, - const Value *&V, ErrorCallbackType ErrCB) { +bool MIParser::parseIRValue(const Value *&V) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(Token, SlotNumber, ErrCB)) + if (getUnsigned(SlotNumber)) return true; - V = PFS.getIRValue(SlotNumber); + V = getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(Token, PFS, GV, ErrCB)) + if (parseGlobalValue(GV)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) + if (parseIRConstant(Token.location(), Token.stringValue(), C)) return true; V = C; break; @@ -2759,17 +2632,10 @@ static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); + return error(Twine("use of undefined IR value '") + Token.range() + "'"); return false; } -bool 
MIParser::parseIRValue(const Value *&V) { - return ::parseIRValue( - Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2790,7 +2656,20 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - return ::getHexUint(Token, Result); + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure it isn't the case before constructing result. + unsigned NumBits = (A == 0) ? 32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2877,19 +2756,6 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; - case MIToken::kw_custom: { - lex(); - if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { - if (Formatter->parseCustomPseudoSourceValue( - Token.stringValue(), MF, PFS, PSV, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - })) - return true; - } else - return error("unable to parse target custom pseudo source value"); - break; - } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2901,7 +2767,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { + Token.is(MIToken::kw_call_entry)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3152,8 +3018,35 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } +static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. 
+static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + const Value *MIParser::getIRValue(unsigned Slot) { - return PFS.getIRValue(Slot); + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3218,15 +3111,3 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } - -bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, - PerFunctionMIParsingState &PFS, const Value *&V, - ErrorCallbackType ErrorCallback) { - MIToken Token; - Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { - ErrorCallback(Loc, Msg); - }); - V = nullptr; - - return ::parseIRValue(Token, PFS, V, ErrorCallback); -} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 9d9c12a95918c..b06e34a809fca 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,7 +709,6 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); - const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -808,7 +807,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); + Op->print(OS, MST, SSNs, Context, &MFI, TII); NeedComma = true; } } @@ -857,7 +856,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -875,28 +874,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } -void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST) { - if (isa(V)) { - V.printAsOperand(OS, /*PrintType=*/false, MST); - return; - } - if (isa(V)) { - // Machine memory operands can load/store to/from constant value pointers. - OS << '`'; - V.printAsOperand(OS, /*PrintType=*/true, MST); - OS << '`'; - return; - } - OS << "%ir."; - if (V.hasName()) { - printLLVMNameWithoutPrefix(OS, V.getName()); - return; - } - int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; - MachineOperand::printIRSlotNumber(OS, Slot); -} - void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 177fef80e2e68..16ae732169a48 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,15 +89,13 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII, - const MIRFormatter *&MIRF) { + const TargetInstrInfo *&TII) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); - MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1479,8 +1477,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - const MIRFormatter *MIRF = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1509,7 +1506,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1564,7 +1561,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1603,7 +1600,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1614,7 +1611,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (i == AsmDescOp && MO.isImm()) { @@ -1681,7 +1678,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isImm() && isOperandSubregIdx(i)) MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI); else - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } @@ -1740,7 +1737,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const MachineMemOperand *Op : memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF); + Op->print(OS, MST, SSNs, *Context, MFI, TII); NeedComma = true; } } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 5dd98467ba663..8b19501ec3cf1 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -459,6 +458,28 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, OS << ""; } +static void printIRValueReference(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + static void printSyncScope(raw_ostream &OS, const LLVMContext &Context, SyncScope::ID SSID, SmallVectorImpl &SSNs) { @@ -713,15 +734,14 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false, - /*IsStandalone=*/true, + print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true, /*ShouldPrintRegisterTies=*/true, /*TiedOperandIdx=*/0, TRI, IntrinsicInfo); } void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, - LLT TypeToPrint, Optional OpIdx, bool PrintDef, - bool IsStandalone, bool ShouldPrintRegisterTies, + LLT TypeToPrint, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { @@ -782,16 +802,9 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '(' << TypeToPrint << ')'; break; } - case MachineOperand::MO_Immediate: { - const MIRFormatter *Formatter = nullptr; - if (const MachineFunction *MF = getMFIfAvailable(*this)) - Formatter = MF->getTarget().getMIRFormatter(); - if (Formatter) - Formatter->printImm(OS, *getParent(), OpIdx, getImm()); - else - OS << getImm(); + case MachineOperand::MO_Immediate: + OS << getImm(); break; - } case MachineOperand::MO_CImmediate: getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; @@ -1057,8 +1070,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, - const TargetInstrInfo *TII, - const MIRFormatter* MIRF) const { + const TargetInstrInfo *TII) const { OS << '('; if (isVolatile()) OS << "volatile "; @@ -1099,7 +1111,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const Value *Val = getValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); - MIRFormatter::printIRValue(OS, *Val, MST); + printIRValueReference(OS, *Val, MST); } else if (const PseudoSourceValue *PVal = getPseudoValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); assert(PVal && "Expected a pseudo source value"); @@ -1132,20 +1144,15 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printLLVMNameWithoutPrefix( OS, cast(PVal)->getSymbol()); break; - default: { + default: // FIXME: This is not necessarily the correct MIR serialization format for // a custom pseudo source value, but at least it allows // -print-machineinstrs to work on a target with custom pseudo source // values. 
- OS << "custom \""; - if (MIRF) - MIRF->printCustomPseudoSourceValue(OS, MST, *PVal); - else - PVal->printCustom(OS); - OS << '\"'; + OS << "custom "; + PVal->printCustom(OS); break; } - } } MachineOperand::printOperandOffset(OS, getOffset()); if (getBaseAlignment() != getSize()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index a12d708c2dd4d..6fd71393bf38c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,7 +480,7 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, if (MF) MST.incorporateFunction(MF->getFunction()); SmallVector SSNs; - MMO.print(OS, MST, SSNs, Ctx, MFI, TII, MF->getTarget().getMIRFormatter()); + MMO.print(OS, MST, SSNs, Ctx, MFI, TII); } static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index f070b143d5b4e..97a1eb2f190a9 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -12,7 +12,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -38,9 +37,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false), O0WantsFastISel(false), - DefaultOptions(Options), Options(Options) { - MIRF = std::make_unique(); -} + DefaultOptions(Options), Options(Options) {} TargetMachine::~TargetMachine() = default; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index ce71a89adacb7..dd4f892ebc231 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: 
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; 
PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact 
[[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, 
align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index aea37fd08b408..75d25b0c2c469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom 
TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ 
-159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index c5aa36df8675d..4db5fe081fda2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; 
CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, 
align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) 
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index d4a3f4025b378..44a17012237ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x 
s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index e5d67a3f88742..c59372a8d09c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: 
(store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 4443daba2ee2d..33a8e9a1284cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 
x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], 
implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index a657488278b04..9bc81aecc8a1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), 
implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index a6ba559382f5c..efe81eabc3497 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, 
align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index ce62e041aa67e..f96a13878ba6a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 
from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, 
align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
16 from custom "TargetCustom7" + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, 
addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed 
[[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into 
custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, 
addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: 
[[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], 
[[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 
0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into 
custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 
bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 93322c7da4f86..0c264251942a4 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index 068e3d98f17be..cdcf7383afc09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index 7e60fab281545..faa471f2260c7 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
   raw_string_ostream OS(str);
-  MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false,
+  MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false,
            /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr,
            /*IntrinsicInfo=*/nullptr);
   ASSERT_TRUE(OS.str() == "!0");

From 3ef05d85be8c3666ebfa3ad986eb334da5195a47 Mon Sep 17 00:00:00 2001
From: Peng Guo
Date: Wed, 8 Jan 2020 17:13:24 -0800
Subject: [PATCH 20/89] [MIR] Target specific MIR formatting and parsing

Summary:
Added MIRFormatter for target specific MIR formatting and parsing with
immediate and custom pseudo source values. The target machine can subclass
MIRFormatter and implement custom logic for printing and parsing immediate
and custom pseudo source values for better readability.

* Target specific immediate mnemonics need to start with "." followed by an
  identifier string. When the MIR parser sees such an immediate it will call
  the target specific parsing function.

* Custom pseudo source values need to start with the keyword "custom"
  followed by a double-quoted string. The MIR parser will pass the quoted
  string to the target specific PSV parsing function.

* MIRFormatter has 2 helper functions to facilitate LLVM value printing and
  parsing for custom PSVs if they refer to LLVM values.

Patch by Peng Guo

Reviewers: dsanders, arsenm

Reviewed By: dsanders

Subscribers: wdng, jvesely, nhaehnle, hiraditya, jfb, llvm-commits

Tags: #llvm

Differential Revision: https://reviews.llvm.org/D69836
---
 llvm/include/llvm/CodeGen/MIRFormatter.h      |  83 ++++++
 .../include/llvm/CodeGen/MIRParser/MIParser.h |   4 +
 llvm/include/llvm/CodeGen/MachineMemOperand.h |   4 +-
 llvm/include/llvm/CodeGen/MachineOperand.h    |   8 +-
 llvm/include/llvm/CodeGen/PseudoSourceValue.h |   2 +
 llvm/include/llvm/CodeGen/TargetInstrInfo.h   |   2 +
 llvm/include/llvm/Target/TargetMachine.h      |   6 +
 llvm/lib/CodeGen/MIRParser/MILexer.cpp        |   1 +
 llvm/lib/CodeGen/MIRParser/MILexer.h          |   1 +
 llvm/lib/CodeGen/MIRParser/MIParser.cpp       | 265 +++++++++++++-----
 llvm/lib/CodeGen/MIRPrinter.cpp               |  27 +-
 llvm/lib/CodeGen/MachineInstr.cpp             |  19 +-
 llvm/lib/CodeGen/MachineOperand.cpp           |  57 ++--
 .../SelectionDAG/SelectionDAGDumper.cpp       |   2 +-
 llvm/lib/Target/TargetMachine.cpp             |   5 +-
 ...llvm.amdgcn.raw.buffer.store.format.f16.ll |  44 +--
 ...llvm.amdgcn.raw.buffer.store.format.f32.ll |  24 +-
 .../llvm.amdgcn.raw.buffer.store.ll           |  66 ++---
 .../regbankselect-amdgcn.image.load.1d.ll     |   8 +-
 .../regbankselect-amdgcn.image.sample.1d.ll   |  10 +-
 .../regbankselect-amdgcn.raw.buffer.load.ll   |  10 +-
 ...regbankselect-amdgcn.struct.buffer.load.ll |  10 +-
 ...egbankselect-amdgcn.struct.buffer.store.ll |  10 +-
 .../AMDGPU/buffer-intrinsics-mmo-offsets.ll   | 200 ++++++-------
 .../AMDGPU/extract_subvector_vec4_vec3.ll     |   4 +-
 .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll      |   2 +-
 llvm/unittests/CodeGen/MachineOperandTest.cpp |   2 +-
 27 files changed, 561 insertions(+), 315 deletions(-)
 create mode 100644 llvm/include/llvm/CodeGen/MIRFormatter.h

diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h
new file mode 100644
index 0000000000000..e57c32c5ae614
--- /dev/null
+++ b/llvm/include/llvm/CodeGen/MIRFormatter.h
@@ -0,0 +1,83 @@
+//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the declaration of the MIRFormatter class.
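As an aside to the summary above: a minimal, hypothetical sketch (not part of this patch) of how a backend could implement these hooks. The names MyTarget, MY_LOAD, ".glc", and "MyTargetPSV" are placeholders invented for illustration only; the signatures follow the MIRFormatter declaration added below.

#include "llvm/CodeGen/MIRFormatter.h"
using namespace llvm;

// Hypothetical backend formatter; MyTarget::MY_LOAD is a made-up opcode.
class MyTargetMIRFormatter : public MIRFormatter {
public:
  // Print operand 3 of MY_LOAD as a symbolic ".glc" flag instead of a raw
  // integer; fall back to the default for everything else.
  void printImm(raw_ostream &OS, const MachineInstr &MI,
                Optional<unsigned> OpIdx, int64_t Imm) const override {
    if (MI.getOpcode() == MyTarget::MY_LOAD && OpIdx && *OpIdx == 3 && Imm == 1)
      OS << ".glc";
    else
      OS << Imm;
  }

  // Accept the same mnemonic back when parsing MIR; returning false means
  // success, mirroring the MIParser convention.
  bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
                        StringRef Src, int64_t &Imm,
                        ErrorCallbackType ErrorCallback) const override {
    if (OpCode == MyTarget::MY_LOAD && OpIdx == 3 && Src == ".glc") {
      Imm = 1;
      return false;
    }
    return ErrorCallback(Src.begin(), "unknown target immediate mnemonic");
  }

  // Give the custom pseudo source value a stable, quotable name; the MIR
  // printer wraps it as: custom "MyTargetPSV".
  void printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST,
                                    const PseudoSourceValue &PSV) const override {
    OS << "MyTargetPSV";
  }
};

A target would expose an instance of such a class by overriding TargetMachine::getMIRFormatter(); the flag operand then round-trips through MIR as ".glc" and the memory operand as custom "MyTargetPSV", the same quoted form exercised by the AMDGPU test updates further down in this patch.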
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MIRFORMATTER_H +#define LLVM_CODEGEN_MIRFORMATTER_H + +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/PseudoSourceValue.h" + +namespace llvm { + +struct PerFunctionMIParsingState; +struct SlotMapping; + +/// MIRFormater - Interface to format MIR operand based on target +class MIRFormatter { +public: + typedef function_ref + ErrorCallbackType; + + MIRFormatter() {} + virtual ~MIRFormatter() = default; + + /// Implement target specific printing for machine operand immediate value, so + /// that we can have more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. + virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + Optional OpIdx, int64_t Imm) const { + OS << Imm; + } + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is + /// dot seperated strings. + virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const { + llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); + } + + /// Implement target specific printing of target custom pseudo source value. + /// Default implementation is not necessarily the correct MIR serialization + /// format. + virtual void + printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, + const PseudoSourceValue &PSV) const { + PSV.printCustom(OS); + } + + /// Implement target specific parsing of target custom pseudo source value. + virtual bool parseCustomPseudoSourceValue( + StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, + const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { + llvm_unreachable( + "target did not implement parsing MIR custom pseudo source value"); + } + + /// Helper functions to print IR value as MIR serialization format which will + /// be useful for target specific printer, e.g. for printing IR value in + /// custom pseudo source value. + static void printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST); + + /// Helper functions to parse IR value from MIR serialization format which + /// will be useful for target specific parser, e.g. for parsing IR value for + /// custom pseudo source value. + static bool parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback); +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 4e32a04551c1c..8ca665b23b280 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,12 +171,16 @@ struct PerFunctionMIParsingState { DenseMap ConstantPoolSlots; DenseMap JumpTableSlots; + /// Maps from slot numbers to function's unnamed values. 
+ DenseMap Slots2Values; + PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); + const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index 7ee700c62a25c..b0243646b06c7 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,6 +26,7 @@ namespace llvm { class FoldingSetNodeID; class MDNode; +class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -295,7 +296,8 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII, + const MIRFormatter *MIRF) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index df914dc2d85e8..4222c03b023a9 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,6 +278,9 @@ class MachineOperand { /// More complex way of printing a MachineOperand. /// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. + /// \param OpIdx - specifies the index of the operand in machine instruction. + /// This will be used by target dependent MIR formatter. Could be None if the + /// index is unknown, e.g. called by dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -294,8 +297,9 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, - unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, + Optional OpIdx, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, + const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 4b3cc9145a134..593a865ea5458 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,6 +22,7 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; +class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -52,6 +53,7 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). + friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<. 
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 4b4cea30b2ba3..e410d1c4806d3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1807,6 +1808,7 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: + mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 176ae39b17a7c..39422ac3bf8ca 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,6 +33,7 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; +class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -94,6 +95,7 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; + std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -197,6 +199,10 @@ class TargetMachine { return nullptr; } + /// Return MIR formatter to format/parse MIR operands. Target can override + /// this virtual function and return target specific MIR formatter. + virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } + bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 21511586ff185..0c35a91f8282e 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,6 +242,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) + .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 1e2eba91ceb53..af5327cacfea5 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,6 +110,7 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, + kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 525c70016a0fb..0f2648e2bfac5 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,6 +28,7 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -343,6 +344,37 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } +static void mapValueToSlot(const Value *V, 
ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. +static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -370,8 +402,6 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; - /// Maps from slot numbers to function's unnamed values. - DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -455,9 +485,12 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest, + bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -473,6 +506,9 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); + bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, const MIRFormatter &MF); + private: /// Convert the integer literal in the current token into an unsigned integer. /// @@ -551,6 +587,9 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } +typedef function_ref + ErrorCallbackType; + static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -912,7 +951,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1493,17 +1532,61 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { +bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, + const MIRFormatter &MF) { + assert(Token.is(MIToken::dot)); + auto Loc = Token.location(); // record start position + size_t Len = 1; // for "." + lex(); + + // Handle the case that mnemonic starts with number. 
+ if (Token.is(MIToken::IntegerLiteral)) { + Len += Token.range().size(); + lex(); + } + + StringRef Src; + if (Token.is(MIToken::comma)) + Src = StringRef(Loc, Len); + else { + assert(Token.is(MIToken::Identifier)); + Src = StringRef(Loc, Len + Token.stringValue().size()); + } + int64_t Val; + if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, + [this](StringRef::iterator Loc, const Twine &Msg) + -> bool { return error(Loc, Msg); })) + return true; + + Dest = MachineOperand::CreateImm(Val); + if (!Token.is(MIToken::comma)) + lex(); + return false; +} + +static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + PerFunctionMIParsingState &PFS, const Constant *&C, + ErrorCallbackType ErrCB) { auto Source = StringValue.str(); // The source has to be null terminated. SMDiagnostic Err; - C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return error(Loc + Err.getColumnNo(), Err.getMessage()); + return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + return ::parseIRConstant( + Loc, StringValue, PFS, C, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1636,27 +1719,52 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::getUnsigned(unsigned &Result) { +static bool getHexUint(const MIToken &Token, APInt &Result) { + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure it isn't the case before constructing result. + unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; +} + +bool getUnsigned(const MIToken &Token, unsigned &Result, + ErrorCallbackType ErrCB) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(A)) + if (getHexUint(Token, A)) return true; if (A.getBitWidth() > 32) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } +bool MIParser::getUnsigned(unsigned &Result) { + return ::getUnsigned( + Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1736,23 +1844,25 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { +static bool parseGlobalValue(const MIToken &Token, + PerFunctionMIParsingState &PFS, GlobalValue *&GV, + ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = MF.getFunction().getParent(); + const Module *M = PFS.MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return error(Twine("use of undefined global value '") + Token.range() + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '") + + Token.range() + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(GVIdx)) + if (getUnsigned(Token, GVIdx, ErrCB)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return error(Twine("use of undefined global value '@") + Twine(GVIdx) + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '@") + + Twine(GVIdx) + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1762,6 +1872,14 @@ bool MIParser::parseGlobalValue(GlobalValue *&GV) { return false; } +bool MIParser::parseGlobalValue(GlobalValue *&GV) { + return ::parseGlobalValue( + Token, PFS, GV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2410,7 +2528,8 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest, +bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2499,6 +2618,12 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); + case MIToken::dot: { + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); + } + LLVM_FALLTHROUGH; + } default: // FIXME: Parse the MCSymbol machine operand. 
return error("expected a machine operand"); @@ -2507,7 +2632,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, } bool MIParser::parseMachineOperandAndTargetFlags( - MachineOperand &Dest, Optional &TiedDefIdx) { + const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, + Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2539,7 +2665,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(Dest, TiedDefIdx)) + if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2600,30 +2726,31 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -bool MIParser::parseIRValue(const Value *&V) { +static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, + const Value *&V, ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(SlotNumber)) + if (getUnsigned(Token, SlotNumber, ErrCB)) return true; - V = getIRValue(SlotNumber); + V = PFS.getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(GV)) + if (parseGlobalValue(Token, PFS, GV, ErrCB)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), C)) + if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) return true; V = C; break; @@ -2632,10 +2759,17 @@ bool MIParser::parseIRValue(const Value *&V) { llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return error(Twine("use of undefined IR value '") + Token.range() + "'"); + return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); return false; } +bool MIParser::parseIRValue(const Value *&V) { + return ::parseIRValue( + Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2656,20 +2790,7 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; + return ::getHexUint(Token, Result); } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2756,6 +2877,19 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; + case MIToken::kw_custom: { + lex(); + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + if (Formatter->parseCustomPseudoSourceValue( + Token.stringValue(), MF, PFS, PSV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + })) + return true; + } else + return error("unable to parse target custom pseudo source value"); + break; + } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2767,7 +2901,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry)) { + Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3018,35 +3152,8 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } -static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. 
-static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - const Value *MIParser::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; + return PFS.getIRValue(Slot); } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3111,3 +3218,15 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } + +bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback) { + MIToken Token; + Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { + ErrorCallback(Loc, Msg); + }); + V = nullptr; + + return ::parseIRValue(Token, PFS, V, ErrorCallback); +} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b06e34a809fca..9d9c12a95918c 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,6 +709,7 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); + const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -807,7 +808,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII); + Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); NeedComma = true; } } @@ -856,7 +857,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -874,6 +875,28 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } +void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 16ae732169a48..177fef80e2e68 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,13 +89,15 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII) { + const TargetInstrInfo *&TII, + const MIRFormatter *&MIRF) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); + MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1477,7 +1479,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); + const MIRFormatter *MIRF = nullptr; + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1506,7 +1509,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1561,7 +1564,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1600,7 +1603,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1611,7 +1614,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (i == AsmDescOp && MO.isImm()) { @@ -1678,7 +1681,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isImm() && isOperandSubregIdx(i)) MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI); else - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } @@ -1737,7 +1740,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const MachineMemOperand *Op : memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, *Context, MFI, TII); + Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF); NeedComma = true; } } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 8b19501ec3cf1..5dd98467ba663 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -458,28 +459,6 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, OS << ""; } -static void printIRValueReference(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST) { - if (isa(V)) { - V.printAsOperand(OS, /*PrintType=*/false, MST); - return; - } - if (isa(V)) { - // Machine memory operands can load/store to/from constant value pointers. - OS << '`'; - V.printAsOperand(OS, /*PrintType=*/true, MST); - OS << '`'; - return; - } - OS << "%ir."; - if (V.hasName()) { - printLLVMNameWithoutPrefix(OS, V.getName()); - return; - } - int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; - MachineOperand::printIRSlotNumber(OS, Slot); -} - static void printSyncScope(raw_ostream &OS, const LLVMContext &Context, SyncScope::ID SSID, SmallVectorImpl &SSNs) { @@ -734,14 +713,15 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true, + print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false, + /*IsStandalone=*/true, /*ShouldPrintRegisterTies=*/true, /*TiedOperandIdx=*/0, TRI, IntrinsicInfo); } void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, - LLT TypeToPrint, bool PrintDef, bool IsStandalone, - bool ShouldPrintRegisterTies, + LLT TypeToPrint, Optional OpIdx, bool PrintDef, + bool IsStandalone, bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { @@ -802,9 +782,16 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '(' << TypeToPrint << ')'; break; } - case MachineOperand::MO_Immediate: - OS << getImm(); + case MachineOperand::MO_Immediate: { + const MIRFormatter *Formatter = nullptr; + if (const MachineFunction *MF = getMFIfAvailable(*this)) + Formatter = MF->getTarget().getMIRFormatter(); + if (Formatter) + Formatter->printImm(OS, *getParent(), OpIdx, getImm()); + else + OS << getImm(); break; + } case MachineOperand::MO_CImmediate: getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; @@ -1070,7 +1057,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, - const TargetInstrInfo *TII) const { + const TargetInstrInfo *TII, + const MIRFormatter* MIRF) const { OS << '('; if (isVolatile()) OS << "volatile "; @@ -1111,7 +1099,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const Value *Val = getValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); - printIRValueReference(OS, *Val, MST); + MIRFormatter::printIRValue(OS, *Val, MST); } else if (const PseudoSourceValue *PVal = getPseudoValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); assert(PVal && "Expected a pseudo source value"); @@ -1144,15 +1132,20 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printLLVMNameWithoutPrefix( OS, cast(PVal)->getSymbol()); break; - default: + default: { // FIXME: This is not necessarily the correct MIR serialization format for // a custom pseudo source value, but at least it allows // -print-machineinstrs to work on a target with custom pseudo source // values. 
- OS << "custom "; - PVal->printCustom(OS); + OS << "custom \""; + if (MIRF) + MIRF->printCustomPseudoSourceValue(OS, MST, *PVal); + else + PVal->printCustom(OS); + OS << '\"'; break; } + } } MachineOperand::printOperandOffset(OS, getOffset()); if (getBaseAlignment() != getSize()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 6fd71393bf38c..a12d708c2dd4d 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,7 +480,7 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, if (MF) MST.incorporateFunction(MF->getFunction()); SmallVector SSNs; - MMO.print(OS, MST, SSNs, Ctx, MFI, TII); + MMO.print(OS, MST, SSNs, Ctx, MFI, TII, MF->getTarget().getMIRFormatter()); } static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index 97a1eb2f190a9..f070b143d5b4e 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -12,6 +12,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -37,7 +38,9 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false), O0WantsFastISel(false), - DefaultOptions(Options), Options(Options) {} + DefaultOptions(Options), Options(Options) { + MIRF = std::make_unique(); +} TargetMachine::~TargetMachine() = default; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index dd4f892ebc231..ce71a89adacb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: 
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; 
PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact 
[[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, 
align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 75d25b0c2c469..aea37fd08b408 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom 
"TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 4db5fe081fda2..c5aa36df8675d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; 
CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, 
align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + 
; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 44a17012237ec..d4a3f4025b378 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 
x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index c59372a8d09c7..e5d67a3f88742 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: 
(store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 33a8e9a1284cc..4443daba2ee2d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), 
[[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], 
[[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index 9bc81aecc8a1d..a657488278b04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index efe81eabc3497..a6ba559382f5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into 
custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index f96a13878ba6a..ce62e041aa67e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: 
(dereferenceable invariant load 16 from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 
into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom 
"TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], 
[[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from 
custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, 
addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) - ; GCN: 
BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 
304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", 
align 1, addrspace 4) ; GCN: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 0c264251942a4..93322c7da4f86 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index cdcf7383afc09..068e3d98f17be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index faa471f2260c7..7e60fab281545 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
raw_string_ostream OS(str); - MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false, + MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr); ASSERT_TRUE(OS.str() == "!0"); From 71d64f72f934631aa2f12b9542c23f74f256f494 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 8 Jan 2020 22:50:49 -0500 Subject: [PATCH 21/89] Revert "[MIR] Target specific MIR formating and parsing" This reverts commit 3ef05d85be8c3666ebfa3ad986eb334da5195a47. It broke check-llvm on many bots, see comments on D69836. --- llvm/include/llvm/CodeGen/MIRFormatter.h | 83 ------ .../include/llvm/CodeGen/MIRParser/MIParser.h | 4 - llvm/include/llvm/CodeGen/MachineMemOperand.h | 4 +- llvm/include/llvm/CodeGen/MachineOperand.h | 8 +- llvm/include/llvm/CodeGen/PseudoSourceValue.h | 2 - llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 - llvm/include/llvm/Target/TargetMachine.h | 6 - llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1 - llvm/lib/CodeGen/MIRParser/MILexer.h | 1 - llvm/lib/CodeGen/MIRParser/MIParser.cpp | 265 +++++------------- llvm/lib/CodeGen/MIRPrinter.cpp | 27 +- llvm/lib/CodeGen/MachineInstr.cpp | 19 +- llvm/lib/CodeGen/MachineOperand.cpp | 57 ++-- .../SelectionDAG/SelectionDAGDumper.cpp | 2 +- llvm/lib/Target/TargetMachine.cpp | 5 +- ...llvm.amdgcn.raw.buffer.store.format.f16.ll | 44 +-- ...llvm.amdgcn.raw.buffer.store.format.f32.ll | 24 +- .../llvm.amdgcn.raw.buffer.store.ll | 66 ++--- .../regbankselect-amdgcn.image.load.1d.ll | 8 +- .../regbankselect-amdgcn.image.sample.1d.ll | 10 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 10 +- ...regbankselect-amdgcn.struct.buffer.load.ll | 10 +- ...egbankselect-amdgcn.struct.buffer.store.ll | 10 +- .../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 200 ++++++------- .../AMDGPU/extract_subvector_vec4_vec3.ll | 4 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- llvm/unittests/CodeGen/MachineOperandTest.cpp | 2 +- 27 files changed, 315 insertions(+), 561 deletions(-) delete mode 100644 llvm/include/llvm/CodeGen/MIRFormatter.h diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h deleted file mode 100644 index e57c32c5ae614..0000000000000 --- a/llvm/include/llvm/CodeGen/MIRFormatter.h +++ /dev/null @@ -1,83 +0,0 @@ -//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This file contains the declaration of the MIRFormatter class. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CODEGEN_MIRFORMATTER_H -#define LLVM_CODEGEN_MIRFORMATTER_H - -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/PseudoSourceValue.h" - -namespace llvm { - -struct PerFunctionMIParsingState; -struct SlotMapping; - -/// MIRFormater - Interface to format MIR operand based on target -class MIRFormatter { -public: - typedef function_ref - ErrorCallbackType; - - MIRFormatter() {} - virtual ~MIRFormatter() = default; - - /// Implement target specific printing for machine operand immediate value, so - /// that we can have more meaningful mnemonic than a 64-bit integer. Passing - /// None to OpIdx means the index is unknown. 
- virtual void printImm(raw_ostream &OS, const MachineInstr &MI, - Optional OpIdx, int64_t Imm) const { - OS << Imm; - } - - /// Implement target specific parsing of immediate mnemonics. The mnemonic is - /// dot seperated strings. - virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - StringRef Src, int64_t &Imm, - ErrorCallbackType ErrorCallback) const { - llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); - } - - /// Implement target specific printing of target custom pseudo source value. - /// Default implementation is not necessarily the correct MIR serialization - /// format. - virtual void - printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, - const PseudoSourceValue &PSV) const { - PSV.printCustom(OS); - } - - /// Implement target specific parsing of target custom pseudo source value. - virtual bool parseCustomPseudoSourceValue( - StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, - const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { - llvm_unreachable( - "target did not implement parsing MIR custom pseudo source value"); - } - - /// Helper functions to print IR value as MIR serialization format which will - /// be useful for target specific printer, e.g. for printing IR value in - /// custom pseudo source value. - static void printIRValue(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST); - - /// Helper functions to parse IR value from MIR serialization format which - /// will be useful for target specific parser, e.g. for parsing IR value for - /// custom pseudo source value. - static bool parseIRValue(StringRef Src, MachineFunction &MF, - PerFunctionMIParsingState &PFS, const Value *&V, - ErrorCallbackType ErrorCallback); -}; - -} // end namespace llvm - -#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 8ca665b23b280..4e32a04551c1c 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,16 +171,12 @@ struct PerFunctionMIParsingState { DenseMap ConstantPoolSlots; DenseMap JumpTableSlots; - /// Maps from slot numbers to function's unnamed values. 
- DenseMap Slots2Values; - PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); - const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index b0243646b06c7..7ee700c62a25c 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,7 +26,6 @@ namespace llvm { class FoldingSetNodeID; class MDNode; -class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -296,8 +295,7 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII, - const MIRFormatter *MIRF) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index 4222c03b023a9..df914dc2d85e8 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,9 +278,6 @@ class MachineOperand { /// More complex way of printing a MachineOperand. /// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. - /// \param OpIdx - specifies the index of the operand in machine instruction. - /// This will be used by target dependent MIR formatter. Could be None if the - /// index is unknown, e.g. called by dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -297,9 +294,8 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - Optional OpIdx, bool PrintDef, bool IsStandalone, - bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, - const TargetRegisterInfo *TRI, + bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, + unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 593a865ea5458..4b3cc9145a134 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,7 +22,6 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; -class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -53,7 +52,6 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). - friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<. 
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index e410d1c4806d3..4b4cea30b2ba3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,7 +18,6 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1808,7 +1807,6 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: - mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 39422ac3bf8ca..176ae39b17a7c 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,7 +33,6 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; -class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -95,7 +94,6 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; - std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -199,10 +197,6 @@ class TargetMachine { return nullptr; } - /// Return MIR formatter to format/parse MIR operands. Target can override - /// this virtual function and return target specific MIR formatter. - virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } - bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 0c35a91f8282e..21511586ff185 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,7 +242,6 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) - .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index af5327cacfea5..1e2eba91ceb53 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,7 +110,6 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, - kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 0f2648e2bfac5..525c70016a0fb 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,7 +28,6 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -344,37 +343,6 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } -static void mapValueToSlot(const Value *V, 
ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. -static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - -const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; -} - namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -402,6 +370,8 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; + /// Maps from slot numbers to function's unnamed values. + DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -485,12 +455,9 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, + bool parseMachineOperand(MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(const unsigned OpCode, - const unsigned OpIdx, - MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -506,9 +473,6 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); - bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, const MIRFormatter &MF); - private: /// Convert the integer literal in the current token into an unsigned integer. /// @@ -587,9 +551,6 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } -typedef function_ref - ErrorCallbackType; - static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -951,7 +912,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1532,61 +1493,17 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, - const unsigned OpIdx, - MachineOperand &Dest, - const MIRFormatter &MF) { - assert(Token.is(MIToken::dot)); - auto Loc = Token.location(); // record start position - size_t Len = 1; // for "." - lex(); - - // Handle the case that mnemonic starts with number. 
- if (Token.is(MIToken::IntegerLiteral)) { - Len += Token.range().size(); - lex(); - } - - StringRef Src; - if (Token.is(MIToken::comma)) - Src = StringRef(Loc, Len); - else { - assert(Token.is(MIToken::Identifier)); - Src = StringRef(Loc, Len + Token.stringValue().size()); - } - int64_t Val; - if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, - [this](StringRef::iterator Loc, const Twine &Msg) - -> bool { return error(Loc, Msg); })) - return true; - - Dest = MachineOperand::CreateImm(Val); - if (!Token.is(MIToken::comma)) - lex(); - return false; -} - -static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - PerFunctionMIParsingState &PFS, const Constant *&C, - ErrorCallbackType ErrCB) { +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { auto Source = StringValue.str(); // The source has to be null terminated. SMDiagnostic Err; - C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); + return error(Loc + Err.getColumnNo(), Err.getMessage()); return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { - return ::parseIRConstant( - Loc, StringValue, PFS, C, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1719,52 +1636,27 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -static bool getHexUint(const MIToken &Token, APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; -} - -bool getUnsigned(const MIToken &Token, unsigned &Result, - ErrorCallbackType ErrCB) { +bool MIParser::getUnsigned(unsigned &Result) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return ErrCB(Token.location(), "expected 32-bit integer (too large)"); + return error("expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(Token, A)) + if (getHexUint(A)) return true; if (A.getBitWidth() > 32) - return ErrCB(Token.location(), "expected 32-bit integer (too large)"); + return error("expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } -bool MIParser::getUnsigned(unsigned &Result) { - return ::getUnsigned( - Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1844,25 +1736,23 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -static bool parseGlobalValue(const MIToken &Token, - PerFunctionMIParsingState &PFS, GlobalValue *&GV, - ErrorCallbackType ErrCB) { +bool MIParser::parseGlobalValue(GlobalValue *&GV) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = PFS.MF.getFunction().getParent(); + const Module *M = MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return ErrCB(Token.location(), Twine("use of undefined global value '") + - Token.range() + "'"); + return error(Twine("use of undefined global value '") + Token.range() + + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(Token, GVIdx, ErrCB)) + if (getUnsigned(GVIdx)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return ErrCB(Token.location(), Twine("use of undefined global value '@") + - Twine(GVIdx) + "'"); + return error(Twine("use of undefined global value '@") + Twine(GVIdx) + + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1872,14 +1762,6 @@ static bool parseGlobalValue(const MIToken &Token, return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { - return ::parseGlobalValue( - Token, PFS, GV, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2528,8 +2410,7 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, - MachineOperand &Dest, +bool MIParser::parseMachineOperand(MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2618,12 +2499,6 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); - case MIToken::dot: { - if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { - return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); - } - LLVM_FALLTHROUGH; - } default: // FIXME: Parse the MCSymbol machine operand. 
return error("expected a machine operand"); @@ -2632,8 +2507,7 @@ bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, } bool MIParser::parseMachineOperandAndTargetFlags( - const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, - Optional &TiedDefIdx) { + MachineOperand &Dest, Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2665,7 +2539,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) + if (parseMachineOperand(Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2726,31 +2600,30 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, - const Value *&V, ErrorCallbackType ErrCB) { +bool MIParser::parseIRValue(const Value *&V) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(Token, SlotNumber, ErrCB)) + if (getUnsigned(SlotNumber)) return true; - V = PFS.getIRValue(SlotNumber); + V = getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(Token, PFS, GV, ErrCB)) + if (parseGlobalValue(GV)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) + if (parseIRConstant(Token.location(), Token.stringValue(), C)) return true; V = C; break; @@ -2759,17 +2632,10 @@ static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); + return error(Twine("use of undefined IR value '") + Token.range() + "'"); return false; } -bool MIParser::parseIRValue(const Value *&V) { - return ::parseIRValue( - Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - }); -} - bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2790,7 +2656,20 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - return ::getHexUint(Token, Result); + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure it isn't the case before constructing result. + unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2877,19 +2756,6 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; - case MIToken::kw_custom: { - lex(); - if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { - if (Formatter->parseCustomPseudoSourceValue( - Token.stringValue(), MF, PFS, PSV, - [this](StringRef::iterator Loc, const Twine &Msg) -> bool { - return error(Loc, Msg); - })) - return true; - } else - return error("unable to parse target custom pseudo source value"); - break; - } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2901,7 +2767,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { + Token.is(MIToken::kw_call_entry)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3152,8 +3018,35 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } +static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. 
+static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + const Value *MIParser::getIRValue(unsigned Slot) { - return PFS.getIRValue(Slot); + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3218,15 +3111,3 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } - -bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, - PerFunctionMIParsingState &PFS, const Value *&V, - ErrorCallbackType ErrorCallback) { - MIToken Token; - Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { - ErrorCallback(Loc, Msg); - }); - V = nullptr; - - return ::parseIRValue(Token, PFS, V, ErrorCallback); -} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 9d9c12a95918c..b06e34a809fca 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,7 +709,6 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); - const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -808,7 +807,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); + Op->print(OS, MST, SSNs, Context, &MFI, TII); NeedComma = true; } } @@ -857,7 +856,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -875,28 +874,6 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } -void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST) { - if (isa(V)) { - V.printAsOperand(OS, /*PrintType=*/false, MST); - return; - } - if (isa(V)) { - // Machine memory operands can load/store to/from constant value pointers. - OS << '`'; - V.printAsOperand(OS, /*PrintType=*/true, MST); - OS << '`'; - return; - } - OS << "%ir."; - if (V.hasName()) { - printLLVMNameWithoutPrefix(OS, V.getName()); - return; - } - int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; - MachineOperand::printIRSlotNumber(OS, Slot); -} - void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 177fef80e2e68..16ae732169a48 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,15 +89,13 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII, - const MIRFormatter *&MIRF) { + const TargetInstrInfo *&TII) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); - MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1479,8 +1477,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - const MIRFormatter *MIRF = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1509,7 +1506,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1564,7 +1561,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1603,7 +1600,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1614,7 +1611,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (i == AsmDescOp && MO.isImm()) { @@ -1681,7 +1678,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isImm() && isOperandSubregIdx(i)) MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI); else - MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } @@ -1740,7 +1737,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const MachineMemOperand *Op : memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF); + Op->print(OS, MST, SSNs, *Context, MFI, TII); NeedComma = true; } } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 5dd98467ba663..8b19501ec3cf1 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -459,6 +458,28 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, OS << ""; } +static void printIRValueReference(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + static void printSyncScope(raw_ostream &OS, const LLVMContext &Context, SyncScope::ID SSID, SmallVectorImpl &SSNs) { @@ -713,15 +734,14 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false, - /*IsStandalone=*/true, + print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true, /*ShouldPrintRegisterTies=*/true, /*TiedOperandIdx=*/0, TRI, IntrinsicInfo); } void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, - LLT TypeToPrint, Optional OpIdx, bool PrintDef, - bool IsStandalone, bool ShouldPrintRegisterTies, + LLT TypeToPrint, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { @@ -782,16 +802,9 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '(' << TypeToPrint << ')'; break; } - case MachineOperand::MO_Immediate: { - const MIRFormatter *Formatter = nullptr; - if (const MachineFunction *MF = getMFIfAvailable(*this)) - Formatter = MF->getTarget().getMIRFormatter(); - if (Formatter) - Formatter->printImm(OS, *getParent(), OpIdx, getImm()); - else - OS << getImm(); + case MachineOperand::MO_Immediate: + OS << getImm(); break; - } case MachineOperand::MO_CImmediate: getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; @@ -1057,8 +1070,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, - const TargetInstrInfo *TII, - const MIRFormatter* MIRF) const { + const TargetInstrInfo *TII) const { OS << '('; if (isVolatile()) OS << "volatile "; @@ -1099,7 +1111,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const Value *Val = getValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); - MIRFormatter::printIRValue(OS, *Val, MST); + printIRValueReference(OS, *Val, MST); } else if (const PseudoSourceValue *PVal = getPseudoValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); assert(PVal && "Expected a pseudo source value"); @@ -1132,20 +1144,15 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printLLVMNameWithoutPrefix( OS, cast(PVal)->getSymbol()); break; - default: { + default: // FIXME: This is not necessarily the correct MIR serialization format for // a custom pseudo source value, but at least it allows // -print-machineinstrs to work on a target with custom pseudo source // values. 
- OS << "custom \""; - if (MIRF) - MIRF->printCustomPseudoSourceValue(OS, MST, *PVal); - else - PVal->printCustom(OS); - OS << '\"'; + OS << "custom "; + PVal->printCustom(OS); break; } - } } MachineOperand::printOperandOffset(OS, getOffset()); if (getBaseAlignment() != getSize()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index a12d708c2dd4d..6fd71393bf38c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,7 +480,7 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, if (MF) MST.incorporateFunction(MF->getFunction()); SmallVector SSNs; - MMO.print(OS, MST, SSNs, Ctx, MFI, TII, MF->getTarget().getMIRFormatter()); + MMO.print(OS, MST, SSNs, Ctx, MFI, TII); } static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index f070b143d5b4e..97a1eb2f190a9 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -12,7 +12,6 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -38,9 +37,7 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false), O0WantsFastISel(false), - DefaultOptions(Options), Options(Options) { - MIRF = std::make_unique(); -} + DefaultOptions(Options), Options(Options) {} TargetMachine::~TargetMachine() = default; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index ce71a89adacb7..dd4f892ebc231 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: 
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; 
PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact 
[[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, 
align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index aea37fd08b408..75d25b0c2c469 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom 
TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ 
-159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index c5aa36df8675d..4db5fe081fda2 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; 
CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, 
align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) 
+ ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index d4a3f4025b378..44a17012237ec 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x 
s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index e5d67a3f88742..c59372a8d09c7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: 
(store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 4443daba2ee2d..33a8e9a1284cc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 
x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], 
implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index a657488278b04..9bc81aecc8a1d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), 
implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed 
[[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index a6ba559382f5c..efe81eabc3497 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, 
align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index ce62e041aa67e..f96a13878ba6a 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: (dereferenceable invariant load 16 
from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, 
align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], 
[[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 
16 from custom "TargetCustom7" + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, 
addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed 
[[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into 
custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, 
addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: 
[[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], 
[[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 
0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into 
custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 
bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 93322c7da4f86..0c264251942a4 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index 068e3d98f17be..cdcf7383afc09 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index 7e60fab281545..faa471f2260c7 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
raw_string_ostream OS(str); - MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false, + MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr); ASSERT_TRUE(OS.str() == "!0"); From de3d0ee023cb14c06d5be01369ef8db4cbfa16b4 Mon Sep 17 00:00:00 2001 From: Daniel Sanders Date: Wed, 8 Jan 2020 20:02:37 -0800 Subject: [PATCH 22/89] Revert "Revert "[MIR] Target specific MIR formating and parsing"" There was an unguarded dereference of MF in a function that permitted nullptr. Fixed This reverts commit 71d64f72f934631aa2f12b9542c23f74f256f494. --- llvm/include/llvm/CodeGen/MIRFormatter.h | 83 ++++++ .../include/llvm/CodeGen/MIRParser/MIParser.h | 4 + llvm/include/llvm/CodeGen/MachineMemOperand.h | 4 +- llvm/include/llvm/CodeGen/MachineOperand.h | 8 +- llvm/include/llvm/CodeGen/PseudoSourceValue.h | 2 + llvm/include/llvm/CodeGen/TargetInstrInfo.h | 2 + llvm/include/llvm/Target/TargetMachine.h | 6 + llvm/lib/CodeGen/MIRParser/MILexer.cpp | 1 + llvm/lib/CodeGen/MIRParser/MILexer.h | 1 + llvm/lib/CodeGen/MIRParser/MIParser.cpp | 265 +++++++++++++----- llvm/lib/CodeGen/MIRPrinter.cpp | 27 +- llvm/lib/CodeGen/MachineInstr.cpp | 19 +- llvm/lib/CodeGen/MachineOperand.cpp | 57 ++-- .../SelectionDAG/SelectionDAGDumper.cpp | 3 +- llvm/lib/Target/TargetMachine.cpp | 5 +- ...llvm.amdgcn.raw.buffer.store.format.f16.ll | 44 +-- ...llvm.amdgcn.raw.buffer.store.format.f32.ll | 24 +- .../llvm.amdgcn.raw.buffer.store.ll | 66 ++--- .../regbankselect-amdgcn.image.load.1d.ll | 8 +- .../regbankselect-amdgcn.image.sample.1d.ll | 10 +- .../regbankselect-amdgcn.raw.buffer.load.ll | 10 +- ...regbankselect-amdgcn.struct.buffer.load.ll | 10 +- ...egbankselect-amdgcn.struct.buffer.store.ll | 10 +- .../AMDGPU/buffer-intrinsics-mmo-offsets.ll | 200 ++++++------- .../AMDGPU/extract_subvector_vec4_vec3.ll | 4 +- .../AMDGPU/llvm.amdgcn.ds.gws.barrier.ll | 2 +- llvm/unittests/CodeGen/MachineOperandTest.cpp | 2 +- 27 files changed, 562 insertions(+), 315 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/MIRFormatter.h diff --git a/llvm/include/llvm/CodeGen/MIRFormatter.h b/llvm/include/llvm/CodeGen/MIRFormatter.h new file mode 100644 index 0000000000000..e57c32c5ae614 --- /dev/null +++ b/llvm/include/llvm/CodeGen/MIRFormatter.h @@ -0,0 +1,83 @@ +//===-- llvm/CodeGen/MIRFormatter.h -----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file contains the declaration of the MIRFormatter class. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_MIRFORMATTER_H +#define LLVM_CODEGEN_MIRFORMATTER_H + +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/PseudoSourceValue.h" + +namespace llvm { + +struct PerFunctionMIParsingState; +struct SlotMapping; + +/// MIRFormater - Interface to format MIR operand based on target +class MIRFormatter { +public: + typedef function_ref + ErrorCallbackType; + + MIRFormatter() {} + virtual ~MIRFormatter() = default; + + /// Implement target specific printing for machine operand immediate value, so + /// that we can have more meaningful mnemonic than a 64-bit integer. Passing + /// None to OpIdx means the index is unknown. 
+ virtual void printImm(raw_ostream &OS, const MachineInstr &MI, + Optional OpIdx, int64_t Imm) const { + OS << Imm; + } + + /// Implement target specific parsing of immediate mnemonics. The mnemonic is + /// dot seperated strings. + virtual bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + StringRef Src, int64_t &Imm, + ErrorCallbackType ErrorCallback) const { + llvm_unreachable("target did not implement parsing MIR immediate mnemonic"); + } + + /// Implement target specific printing of target custom pseudo source value. + /// Default implementation is not necessarily the correct MIR serialization + /// format. + virtual void + printCustomPseudoSourceValue(raw_ostream &OS, ModuleSlotTracker &MST, + const PseudoSourceValue &PSV) const { + PSV.printCustom(OS); + } + + /// Implement target specific parsing of target custom pseudo source value. + virtual bool parseCustomPseudoSourceValue( + StringRef Src, MachineFunction &MF, PerFunctionMIParsingState &PFS, + const PseudoSourceValue *&PSV, ErrorCallbackType ErrorCallback) const { + llvm_unreachable( + "target did not implement parsing MIR custom pseudo source value"); + } + + /// Helper functions to print IR value as MIR serialization format which will + /// be useful for target specific printer, e.g. for printing IR value in + /// custom pseudo source value. + static void printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST); + + /// Helper functions to parse IR value from MIR serialization format which + /// will be useful for target specific parser, e.g. for parsing IR value for + /// custom pseudo source value. + static bool parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback); +}; + +} // end namespace llvm + +#endif diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h index 4e32a04551c1c..8ca665b23b280 100644 --- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h +++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h @@ -171,12 +171,16 @@ struct PerFunctionMIParsingState { DenseMap ConstantPoolSlots; DenseMap JumpTableSlots; + /// Maps from slot numbers to function's unnamed values. 
+ DenseMap Slots2Values; + PerFunctionMIParsingState(MachineFunction &MF, SourceMgr &SM, const SlotMapping &IRSlots, PerTargetMIParsingState &Target); VRegInfo &getVRegInfo(unsigned Num); VRegInfo &getVRegInfoNamed(StringRef RegName); + const Value *getIRValue(unsigned Slot); }; /// Parse the machine basic block definitions, and skip the machine diff --git a/llvm/include/llvm/CodeGen/MachineMemOperand.h b/llvm/include/llvm/CodeGen/MachineMemOperand.h index 7ee700c62a25c..b0243646b06c7 100644 --- a/llvm/include/llvm/CodeGen/MachineMemOperand.h +++ b/llvm/include/llvm/CodeGen/MachineMemOperand.h @@ -26,6 +26,7 @@ namespace llvm { class FoldingSetNodeID; class MDNode; +class MIRFormatter; class raw_ostream; class MachineFunction; class ModuleSlotTracker; @@ -295,7 +296,8 @@ class MachineMemOperand { /// @{ void print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, - const MachineFrameInfo *MFI, const TargetInstrInfo *TII) const; + const MachineFrameInfo *MFI, const TargetInstrInfo *TII, + const MIRFormatter *MIRF) const; /// @} friend bool operator==(const MachineMemOperand &LHS, diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index df914dc2d85e8..4222c03b023a9 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -278,6 +278,9 @@ class MachineOperand { /// More complex way of printing a MachineOperand. /// \param TypeToPrint specifies the generic type to be printed on uses and /// defs. It can be determined using MachineInstr::getTypeToPrint. + /// \param OpIdx - specifies the index of the operand in machine instruction. + /// This will be used by target dependent MIR formatter. Could be None if the + /// index is unknown, e.g. called by dump(). /// \param PrintDef - whether we want to print `def` on an operand which /// isDef. Sometimes, if the operand is printed before '=', we don't print /// `def`. @@ -294,8 +297,9 @@ class MachineOperand { /// information from it's parent. /// \param IntrinsicInfo - same as \p TRI. void print(raw_ostream &os, ModuleSlotTracker &MST, LLT TypeToPrint, - bool PrintDef, bool IsStandalone, bool ShouldPrintRegisterTies, - unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, + Optional OpIdx, bool PrintDef, bool IsStandalone, + bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, + const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const; /// Same as print(os, TRI, IntrinsicInfo), but allows to specify the low-level diff --git a/llvm/include/llvm/CodeGen/PseudoSourceValue.h b/llvm/include/llvm/CodeGen/PseudoSourceValue.h index 4b3cc9145a134..593a865ea5458 100644 --- a/llvm/include/llvm/CodeGen/PseudoSourceValue.h +++ b/llvm/include/llvm/CodeGen/PseudoSourceValue.h @@ -22,6 +22,7 @@ namespace llvm { class MachineFrameInfo; class MachineMemOperand; +class MIRFormatter; class raw_ostream; class TargetInstrInfo; @@ -52,6 +53,7 @@ class PseudoSourceValue { const PseudoSourceValue* PSV); friend class MachineMemOperand; // For printCustom(). + friend class MIRFormatter; // For printCustom(). /// Implement printing for PseudoSourceValue. This is called from /// Value::print or Value's operator<<. 
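For reference, here is a minimal sketch (not part of this patch) of how a backend could plug into the MIRFormatter hooks introduced above. The XYZ target, the XYZ::BUF_LOAD opcode, and the "xyz.glc" mnemonic are hypothetical placeholders; a real target would substitute its own opcodes and flag encodings.

#include "llvm/CodeGen/MIRFormatter.h"
using namespace llvm;

// Hypothetical target formatter: prints bit 0 of the flags immediate of
// XYZ::BUF_LOAD as the mnemonic "xyz.glc" and parses it back to 1.
class XYZMIRFormatter : public MIRFormatter {
public:
  void printImm(raw_ostream &OS, const MachineInstr &MI,
                Optional<unsigned> OpIdx, int64_t Imm) const override {
    // Only operand 2 of the (hypothetical) buffer load carries the flags.
    if (OpIdx && MI.getOpcode() == XYZ::BUF_LOAD && *OpIdx == 2 && (Imm & 1)) {
      OS << "xyz.glc";
      return;
    }
    // Fall back to the default, which prints the raw integer.
    MIRFormatter::printImm(OS, MI, OpIdx, Imm);
  }

  bool parseImmMnemonic(const unsigned OpCode, const unsigned OpIdx,
                        StringRef Src, int64_t &Imm,
                        ErrorCallbackType ErrorCallback) const override {
    // Mnemonics are dot-separated strings, e.g. "xyz.glc".
    if (OpCode == XYZ::BUF_LOAD && OpIdx == 2 && Src == "xyz.glc") {
      Imm = 1;
      return false; // false means success, matching the MIParser convention.
    }
    return ErrorCallback(Src.begin(), "unknown XYZ immediate mnemonic");
  }
};

The target's TargetMachine would then hand out an instance of this class through getMIRFormatter() so that both the MIR printer and MIParser pick it up.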
diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h index 4b4cea30b2ba3..e410d1c4806d3 100644 --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -18,6 +18,7 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/None.h" #include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineCombinerPattern.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1807,6 +1808,7 @@ class TargetInstrInfo : public MCInstrInfo { Register Reg) const; private: + mutable std::unique_ptr Formatter; unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode; unsigned CatchRetOpcode; unsigned ReturnOpcode; diff --git a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 176ae39b17a7c..39422ac3bf8ca 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -33,6 +33,7 @@ class MCInstrInfo; class MCRegisterInfo; class MCSubtargetInfo; class MCSymbol; +class MIRFormatter; class raw_pwrite_stream; class PassManagerBuilder; struct PerFunctionMIParsingState; @@ -94,6 +95,7 @@ class TargetMachine { std::unique_ptr MRI; std::unique_ptr MII; std::unique_ptr STI; + std::unique_ptr MIRF; unsigned RequireStructuredCFG : 1; unsigned O0WantsFastISel : 1; @@ -197,6 +199,10 @@ class TargetMachine { return nullptr; } + /// Return MIR formatter to format/parse MIR operands. Target can override + /// this virtual function and return target specific MIR formatter. + virtual const MIRFormatter *getMIRFormatter() const { return MIRF.get(); } + bool requiresStructuredCFG() const { return RequireStructuredCFG; } void setRequiresStructuredCFG(bool Value) { RequireStructuredCFG = Value; } diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp index 21511586ff185..0c35a91f8282e 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp +++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp @@ -242,6 +242,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) { .Case("jump-table", MIToken::kw_jump_table) .Case("constant-pool", MIToken::kw_constant_pool) .Case("call-entry", MIToken::kw_call_entry) + .Case("custom", MIToken::kw_custom) .Case("liveout", MIToken::kw_liveout) .Case("address-taken", MIToken::kw_address_taken) .Case("landing-pad", MIToken::kw_landing_pad) diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h index 1e2eba91ceb53..af5327cacfea5 100644 --- a/llvm/lib/CodeGen/MIRParser/MILexer.h +++ b/llvm/lib/CodeGen/MIRParser/MILexer.h @@ -110,6 +110,7 @@ struct MIToken { kw_jump_table, kw_constant_pool, kw_call_entry, + kw_custom, kw_liveout, kw_address_taken, kw_landing_pad, diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp index 525c70016a0fb..0f2648e2bfac5 100644 --- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp @@ -28,6 +28,7 @@ #include "llvm/AsmParser/SlotMapping.h" #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -343,6 +344,37 @@ VRegInfo &PerFunctionMIParsingState::getVRegInfoNamed(StringRef RegName) { return *I.first->second; } +static void mapValueToSlot(const Value *V, 
ModuleSlotTracker &MST, + DenseMap &Slots2Values) { + int Slot = MST.getLocalSlot(V); + if (Slot == -1) + return; + Slots2Values.insert(std::make_pair(unsigned(Slot), V)); +} + +/// Creates the mapping from slot numbers to function's unnamed IR values. +static void initSlots2Values(const Function &F, + DenseMap &Slots2Values) { + ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); + MST.incorporateFunction(F); + for (const auto &Arg : F.args()) + mapValueToSlot(&Arg, MST, Slots2Values); + for (const auto &BB : F) { + mapValueToSlot(&BB, MST, Slots2Values); + for (const auto &I : BB) + mapValueToSlot(&I, MST, Slots2Values); + } +} + +const Value* PerFunctionMIParsingState::getIRValue(unsigned Slot) { + if (Slots2Values.empty()) + initSlots2Values(MF.getFunction(), Slots2Values); + auto ValueInfo = Slots2Values.find(Slot); + if (ValueInfo == Slots2Values.end()) + return nullptr; + return ValueInfo->second; +} + namespace { /// A wrapper struct around the 'MachineOperand' struct that includes a source @@ -370,8 +402,6 @@ class MIParser { PerFunctionMIParsingState &PFS; /// Maps from slot numbers to function's unnamed basic blocks. DenseMap Slots2BasicBlocks; - /// Maps from slot numbers to function's unnamed values. - DenseMap Slots2Values; public: MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error, @@ -455,9 +485,12 @@ class MIParser { bool parseTargetIndexOperand(MachineOperand &Dest); bool parseCustomRegisterMaskOperand(MachineOperand &Dest); bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest); - bool parseMachineOperand(MachineOperand &Dest, + bool parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); - bool parseMachineOperandAndTargetFlags(MachineOperand &Dest, + bool parseMachineOperandAndTargetFlags(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx); bool parseOffset(int64_t &Offset); bool parseAlignment(unsigned &Alignment); @@ -473,6 +506,9 @@ class MIParser { bool parsePreOrPostInstrSymbol(MCSymbol *&Symbol); bool parseHeapAllocMarker(MDNode *&Node); + bool parseTargetImmMnemonic(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, const MIRFormatter &MF); + private: /// Convert the integer literal in the current token into an unsigned integer. /// @@ -551,6 +587,9 @@ bool MIParser::error(StringRef::iterator Loc, const Twine &Msg) { return true; } +typedef function_ref + ErrorCallbackType; + static const char *toString(MIToken::TokenKind TokenKind) { switch (TokenKind) { case MIToken::comma: @@ -912,7 +951,7 @@ bool MIParser::parse(MachineInstr *&MI) { Token.isNot(MIToken::coloncolon) && Token.isNot(MIToken::lbrace)) { auto Loc = Token.location(); Optional TiedDefIdx; - if (parseMachineOperandAndTargetFlags(MO, TiedDefIdx)) + if (parseMachineOperandAndTargetFlags(OpCode, Operands.size(), MO, TiedDefIdx)) return true; if (OpCode == TargetOpcode::DBG_VALUE && MO.isReg()) MO.setIsDebug(); @@ -1493,17 +1532,61 @@ bool MIParser::parseImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, - const Constant *&C) { +bool MIParser::parseTargetImmMnemonic(const unsigned OpCode, + const unsigned OpIdx, + MachineOperand &Dest, + const MIRFormatter &MF) { + assert(Token.is(MIToken::dot)); + auto Loc = Token.location(); // record start position + size_t Len = 1; // for "." + lex(); + + // Handle the case that mnemonic starts with number. 
+ if (Token.is(MIToken::IntegerLiteral)) { + Len += Token.range().size(); + lex(); + } + + StringRef Src; + if (Token.is(MIToken::comma)) + Src = StringRef(Loc, Len); + else { + assert(Token.is(MIToken::Identifier)); + Src = StringRef(Loc, Len + Token.stringValue().size()); + } + int64_t Val; + if (MF.parseImmMnemonic(OpCode, OpIdx, Src, Val, + [this](StringRef::iterator Loc, const Twine &Msg) + -> bool { return error(Loc, Msg); })) + return true; + + Dest = MachineOperand::CreateImm(Val); + if (!Token.is(MIToken::comma)) + lex(); + return false; +} + +static bool parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + PerFunctionMIParsingState &PFS, const Constant *&C, + ErrorCallbackType ErrCB) { auto Source = StringValue.str(); // The source has to be null terminated. SMDiagnostic Err; - C = parseConstantValue(Source, Err, *MF.getFunction().getParent(), + C = parseConstantValue(Source, Err, *PFS.MF.getFunction().getParent(), &PFS.IRSlots); if (!C) - return error(Loc + Err.getColumnNo(), Err.getMessage()); + return ErrCB(Loc + Err.getColumnNo(), Err.getMessage()); return false; } +bool MIParser::parseIRConstant(StringRef::iterator Loc, StringRef StringValue, + const Constant *&C) { + return ::parseIRConstant( + Loc, StringValue, PFS, C, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseIRConstant(StringRef::iterator Loc, const Constant *&C) { if (parseIRConstant(Loc, StringRef(Loc, Token.range().end() - Loc), C)) return true; @@ -1636,27 +1719,52 @@ bool MIParser::parseFPImmediateOperand(MachineOperand &Dest) { return false; } -bool MIParser::getUnsigned(unsigned &Result) { +static bool getHexUint(const MIToken &Token, APInt &Result) { + assert(Token.is(MIToken::HexLiteral)); + StringRef S = Token.range(); + assert(S[0] == '0' && tolower(S[1]) == 'x'); + // This could be a floating point literal with a special prefix. + if (!isxdigit(S[2])) + return true; + StringRef V = S.substr(2); + APInt A(V.size()*4, V, 16); + + // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make + // sure it isn't the case before constructing result. + unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); + Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); + return false; +} + +bool getUnsigned(const MIToken &Token, unsigned &Result, + ErrorCallbackType ErrCB) { if (Token.hasIntegerValue()) { const uint64_t Limit = uint64_t(std::numeric_limits::max()) + 1; uint64_t Val64 = Token.integerValue().getLimitedValue(Limit); if (Val64 == Limit) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = Val64; return false; } if (Token.is(MIToken::HexLiteral)) { APInt A; - if (getHexUint(A)) + if (getHexUint(Token, A)) return true; if (A.getBitWidth() > 32) - return error("expected 32-bit integer (too large)"); + return ErrCB(Token.location(), "expected 32-bit integer (too large)"); Result = A.getZExtValue(); return false; } return true; } +bool MIParser::getUnsigned(unsigned &Result) { + return ::getUnsigned( + Token, Result, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseMBBReference(MachineBasicBlock *&MBB) { assert(Token.is(MIToken::MachineBasicBlock) || Token.is(MIToken::MachineBasicBlockLabel)); @@ -1736,23 +1844,25 @@ bool MIParser::parseFixedStackObjectOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseGlobalValue(GlobalValue *&GV) { +static bool parseGlobalValue(const MIToken &Token, + PerFunctionMIParsingState &PFS, GlobalValue *&GV, + ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedGlobalValue: { - const Module *M = MF.getFunction().getParent(); + const Module *M = PFS.MF.getFunction().getParent(); GV = M->getNamedValue(Token.stringValue()); if (!GV) - return error(Twine("use of undefined global value '") + Token.range() + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '") + + Token.range() + "'"); break; } case MIToken::GlobalValue: { unsigned GVIdx; - if (getUnsigned(GVIdx)) + if (getUnsigned(Token, GVIdx, ErrCB)) return true; if (GVIdx >= PFS.IRSlots.GlobalValues.size()) - return error(Twine("use of undefined global value '@") + Twine(GVIdx) + - "'"); + return ErrCB(Token.location(), Twine("use of undefined global value '@") + + Twine(GVIdx) + "'"); GV = PFS.IRSlots.GlobalValues[GVIdx]; break; } @@ -1762,6 +1872,14 @@ bool MIParser::parseGlobalValue(GlobalValue *&GV) { return false; } +bool MIParser::parseGlobalValue(GlobalValue *&GV) { + return ::parseGlobalValue( + Token, PFS, GV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::parseGlobalAddressOperand(MachineOperand &Dest) { GlobalValue *GV = nullptr; if (parseGlobalValue(GV)) @@ -2410,7 +2528,8 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) { return false; } -bool MIParser::parseMachineOperand(MachineOperand &Dest, +bool MIParser::parseMachineOperand(const unsigned OpCode, const unsigned OpIdx, + MachineOperand &Dest, Optional &TiedDefIdx) { switch (Token.kind()) { case MIToken::kw_implicit: @@ -2499,6 +2618,12 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, return parseCustomRegisterMaskOperand(Dest); } else return parseTypedImmediateOperand(Dest); + case MIToken::dot: { + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + return parseTargetImmMnemonic(OpCode, OpIdx, Dest, *Formatter); + } + LLVM_FALLTHROUGH; + } default: // FIXME: Parse the MCSymbol machine operand. 
return error("expected a machine operand"); @@ -2507,7 +2632,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest, } bool MIParser::parseMachineOperandAndTargetFlags( - MachineOperand &Dest, Optional &TiedDefIdx) { + const unsigned OpCode, const unsigned OpIdx, MachineOperand &Dest, + Optional &TiedDefIdx) { unsigned TF = 0; bool HasTargetFlags = false; if (Token.is(MIToken::kw_target_flags)) { @@ -2539,7 +2665,7 @@ bool MIParser::parseMachineOperandAndTargetFlags( return true; } auto Loc = Token.location(); - if (parseMachineOperand(Dest, TiedDefIdx)) + if (parseMachineOperand(OpCode, OpIdx, Dest, TiedDefIdx)) return true; if (!HasTargetFlags) return false; @@ -2600,30 +2726,31 @@ bool MIParser::parseOperandsOffset(MachineOperand &Op) { return false; } -bool MIParser::parseIRValue(const Value *&V) { +static bool parseIRValue(const MIToken &Token, PerFunctionMIParsingState &PFS, + const Value *&V, ErrorCallbackType ErrCB) { switch (Token.kind()) { case MIToken::NamedIRValue: { - V = MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); + V = PFS.MF.getFunction().getValueSymbolTable()->lookup(Token.stringValue()); break; } case MIToken::IRValue: { unsigned SlotNumber = 0; - if (getUnsigned(SlotNumber)) + if (getUnsigned(Token, SlotNumber, ErrCB)) return true; - V = getIRValue(SlotNumber); + V = PFS.getIRValue(SlotNumber); break; } case MIToken::NamedGlobalValue: case MIToken::GlobalValue: { GlobalValue *GV = nullptr; - if (parseGlobalValue(GV)) + if (parseGlobalValue(Token, PFS, GV, ErrCB)) return true; V = GV; break; } case MIToken::QuotedIRValue: { const Constant *C = nullptr; - if (parseIRConstant(Token.location(), Token.stringValue(), C)) + if (parseIRConstant(Token.location(), Token.stringValue(), PFS, C, ErrCB)) return true; V = C; break; @@ -2632,10 +2759,17 @@ bool MIParser::parseIRValue(const Value *&V) { llvm_unreachable("The current token should be an IR block reference"); } if (!V) - return error(Twine("use of undefined IR value '") + Token.range() + "'"); + return ErrCB(Token.location(), Twine("use of undefined IR value '") + Token.range() + "'"); return false; } +bool MIParser::parseIRValue(const Value *&V) { + return ::parseIRValue( + Token, PFS, V, [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + }); +} + bool MIParser::getUint64(uint64_t &Result) { if (Token.hasIntegerValue()) { if (Token.integerValue().getActiveBits() > 64) @@ -2656,20 +2790,7 @@ bool MIParser::getUint64(uint64_t &Result) { } bool MIParser::getHexUint(APInt &Result) { - assert(Token.is(MIToken::HexLiteral)); - StringRef S = Token.range(); - assert(S[0] == '0' && tolower(S[1]) == 'x'); - // This could be a floating point literal with a special prefix. - if (!isxdigit(S[2])) - return true; - StringRef V = S.substr(2); - APInt A(V.size()*4, V, 16); - - // If A is 0, then A.getActiveBits() is 0. This isn't a valid bitwidth. Make - // sure it isn't the case before constructing result. - unsigned NumBits = (A == 0) ? 
32 : A.getActiveBits(); - Result = APInt(NumBits, ArrayRef(A.getRawData(), A.getNumWords())); - return false; + return ::getHexUint(Token, Result); } bool MIParser::parseMemoryOperandFlag(MachineMemOperand::Flags &Flags) { @@ -2756,6 +2877,19 @@ bool MIParser::parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV) { "expected a global value or an external symbol after 'call-entry'"); } break; + case MIToken::kw_custom: { + lex(); + if (const auto *Formatter = MF.getTarget().getMIRFormatter()) { + if (Formatter->parseCustomPseudoSourceValue( + Token.stringValue(), MF, PFS, PSV, + [this](StringRef::iterator Loc, const Twine &Msg) -> bool { + return error(Loc, Msg); + })) + return true; + } else + return error("unable to parse target custom pseudo source value"); + break; + } default: llvm_unreachable("The current token should be pseudo source value"); } @@ -2767,7 +2901,7 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) { if (Token.is(MIToken::kw_constant_pool) || Token.is(MIToken::kw_stack) || Token.is(MIToken::kw_got) || Token.is(MIToken::kw_jump_table) || Token.is(MIToken::FixedStackObject) || Token.is(MIToken::StackObject) || - Token.is(MIToken::kw_call_entry)) { + Token.is(MIToken::kw_call_entry) || Token.is(MIToken::kw_custom)) { const PseudoSourceValue *PSV = nullptr; if (parseMemoryPseudoSourceValue(PSV)) return true; @@ -3018,35 +3152,8 @@ const BasicBlock *MIParser::getIRBlock(unsigned Slot, const Function &F) { return getIRBlockFromSlot(Slot, CustomSlots2BasicBlocks); } -static void mapValueToSlot(const Value *V, ModuleSlotTracker &MST, - DenseMap &Slots2Values) { - int Slot = MST.getLocalSlot(V); - if (Slot == -1) - return; - Slots2Values.insert(std::make_pair(unsigned(Slot), V)); -} - -/// Creates the mapping from slot numbers to function's unnamed IR values. 
-static void initSlots2Values(const Function &F, - DenseMap &Slots2Values) { - ModuleSlotTracker MST(F.getParent(), /*ShouldInitializeAllMetadata=*/false); - MST.incorporateFunction(F); - for (const auto &Arg : F.args()) - mapValueToSlot(&Arg, MST, Slots2Values); - for (const auto &BB : F) { - mapValueToSlot(&BB, MST, Slots2Values); - for (const auto &I : BB) - mapValueToSlot(&I, MST, Slots2Values); - } -} - const Value *MIParser::getIRValue(unsigned Slot) { - if (Slots2Values.empty()) - initSlots2Values(MF.getFunction(), Slots2Values); - auto ValueInfo = Slots2Values.find(Slot); - if (ValueInfo == Slots2Values.end()) - return nullptr; - return ValueInfo->second; + return PFS.getIRValue(Slot); } MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) { @@ -3111,3 +3218,15 @@ bool llvm::parseMDNode(PerFunctionMIParsingState &PFS, MDNode *&Node, StringRef Src, SMDiagnostic &Error) { return MIParser(PFS, Error, Src).parseStandaloneMDNode(Node); } + +bool MIRFormatter::parseIRValue(StringRef Src, MachineFunction &MF, + PerFunctionMIParsingState &PFS, const Value *&V, + ErrorCallbackType ErrorCallback) { + MIToken Token; + Src = lexMIToken(Src, Token, [&](StringRef::iterator Loc, const Twine &Msg) { + ErrorCallback(Loc, Msg); + }); + V = nullptr; + + return ::parseIRValue(Token, PFS, V, ErrorCallback); +} diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index b06e34a809fca..9d9c12a95918c 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -709,6 +709,7 @@ void MIPrinter::print(const MachineInstr &MI) { const auto *TRI = SubTarget.getRegisterInfo(); assert(TRI && "Expected target register info"); const auto *TII = SubTarget.getInstrInfo(); + const auto *MIRF = MF->getTarget().getMIRFormatter(); assert(TII && "Expected target instruction info"); if (MI.isCFIInstruction()) assert(MI.getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -807,7 +808,7 @@ void MIPrinter::print(const MachineInstr &MI) { for (const auto *Op : MI.memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, Context, &MFI, TII); + Op->print(OS, MST, SSNs, Context, &MFI, TII, MIRF); NeedComma = true; } } @@ -856,7 +857,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, if (ShouldPrintRegisterTies && Op.isReg() && Op.isTied() && !Op.isDef()) TiedOperandIdx = Op.getParent()->findTiedOperandIdx(OpIdx); const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo(); - Op.print(OS, MST, TypeToPrint, PrintDef, /*IsStandalone=*/false, + Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false, ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII); break; } @@ -874,6 +875,28 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx, } } +void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, + ModuleSlotTracker &MST) { + if (isa(V)) { + V.printAsOperand(OS, /*PrintType=*/false, MST); + return; + } + if (isa(V)) { + // Machine memory operands can load/store to/from constant value pointers. + OS << '`'; + V.printAsOperand(OS, /*PrintType=*/true, MST); + OS << '`'; + return; + } + OS << "%ir."; + if (V.hasName()) { + printLLVMNameWithoutPrefix(OS, V.getName()); + return; + } + int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; + MachineOperand::printIRSlotNumber(OS, Slot); +} + void llvm::printMIR(raw_ostream &OS, const Module &M) { yaml::Output Out(OS); Out << const_cast(M); diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp index 16ae732169a48..177fef80e2e68 100644 --- a/llvm/lib/CodeGen/MachineInstr.cpp +++ b/llvm/lib/CodeGen/MachineInstr.cpp @@ -89,13 +89,15 @@ static void tryToGetTargetInfo(const MachineInstr &MI, const TargetRegisterInfo *&TRI, const MachineRegisterInfo *&MRI, const TargetIntrinsicInfo *&IntrinsicInfo, - const TargetInstrInfo *&TII) { + const TargetInstrInfo *&TII, + const MIRFormatter *&MIRF) { if (const MachineFunction *MF = getMFIfAvailable(MI)) { TRI = MF->getSubtarget().getRegisterInfo(); MRI = &MF->getRegInfo(); IntrinsicInfo = MF->getTarget().getIntrinsicInfo(); TII = MF->getSubtarget().getInstrInfo(); + MIRF = MF->getTarget().getMIRFormatter(); } } @@ -1477,7 +1479,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const TargetRegisterInfo *TRI = nullptr; const MachineRegisterInfo *MRI = nullptr; const TargetIntrinsicInfo *IntrinsicInfo = nullptr; - tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII); + const MIRFormatter *MIRF = nullptr; + tryToGetTargetInfo(*this, TRI, MRI, IntrinsicInfo, TII, MIRF); if (isCFIInstruction()) assert(getNumOperands() == 1 && "Expected 1 operand in CFI instruction"); @@ -1506,7 +1509,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, LLT TypeToPrint = MRI ? getTypeToPrint(StartOp, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(StartOp); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/false, IsStandalone, + MO.print(OS, MST, TypeToPrint, StartOp, /*PrintDef=*/false, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); ++StartOp; } @@ -1561,7 +1564,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, const unsigned OpIdx = InlineAsm::MIOp_AsmString; LLT TypeToPrint = MRI ? getTypeToPrint(OpIdx, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(OpIdx); - getOperand(OpIdx).print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + getOperand(OpIdx).print(OS, MST, TypeToPrint, OpIdx, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); @@ -1600,7 +1603,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (isDebugLabel() && MO.isMetadata()) { @@ -1611,7 +1614,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, else { LLT TypeToPrint = MRI ? 
getTypeToPrint(i, PrintedTypes, *MRI) : LLT{}; unsigned TiedOperandIdx = getTiedOperandIdx(i); - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } else if (i == AsmDescOp && MO.isImm()) { @@ -1678,7 +1681,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, if (MO.isImm() && isOperandSubregIdx(i)) MachineOperand::printSubRegIdx(OS, MO.getImm(), TRI); else - MO.print(OS, MST, TypeToPrint, /*PrintDef=*/true, IsStandalone, + MO.print(OS, MST, TypeToPrint, i, /*PrintDef=*/true, IsStandalone, ShouldPrintRegisterTies, TiedOperandIdx, TRI, IntrinsicInfo); } } @@ -1737,7 +1740,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST, for (const MachineMemOperand *Op : memoperands()) { if (NeedComma) OS << ", "; - Op->print(OS, MST, SSNs, *Context, MFI, TII); + Op->print(OS, MST, SSNs, *Context, MFI, TII, MIRF); NeedComma = true; } } diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp index 8b19501ec3cf1..5dd98467ba663 100644 --- a/llvm/lib/CodeGen/MachineOperand.cpp +++ b/llvm/lib/CodeGen/MachineOperand.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/Analysis/Loads.h" #include "llvm/Analysis/MemoryLocation.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/CodeGen/MIRPrinter.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineJumpTableInfo.h" @@ -458,28 +459,6 @@ static void printIRBlockReference(raw_ostream &OS, const BasicBlock &BB, OS << ""; } -static void printIRValueReference(raw_ostream &OS, const Value &V, - ModuleSlotTracker &MST) { - if (isa(V)) { - V.printAsOperand(OS, /*PrintType=*/false, MST); - return; - } - if (isa(V)) { - // Machine memory operands can load/store to/from constant value pointers. - OS << '`'; - V.printAsOperand(OS, /*PrintType=*/true, MST); - OS << '`'; - return; - } - OS << "%ir."; - if (V.hasName()) { - printLLVMNameWithoutPrefix(OS, V.getName()); - return; - } - int Slot = MST.getCurrentFunction() ? 
MST.getLocalSlot(&V) : -1; - MachineOperand::printIRSlotNumber(OS, Slot); -} - static void printSyncScope(raw_ostream &OS, const LLVMContext &Context, SyncScope::ID SSID, SmallVectorImpl &SSNs) { @@ -734,14 +713,15 @@ void MachineOperand::print(raw_ostream &OS, LLT TypeToPrint, const TargetIntrinsicInfo *IntrinsicInfo) const { tryToGetTargetInfo(*this, TRI, IntrinsicInfo); ModuleSlotTracker DummyMST(nullptr); - print(OS, DummyMST, TypeToPrint, /*PrintDef=*/false, /*IsStandalone=*/true, + print(OS, DummyMST, TypeToPrint, None, /*PrintDef=*/false, + /*IsStandalone=*/true, /*ShouldPrintRegisterTies=*/true, /*TiedOperandIdx=*/0, TRI, IntrinsicInfo); } void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, - LLT TypeToPrint, bool PrintDef, bool IsStandalone, - bool ShouldPrintRegisterTies, + LLT TypeToPrint, Optional OpIdx, bool PrintDef, + bool IsStandalone, bool ShouldPrintRegisterTies, unsigned TiedOperandIdx, const TargetRegisterInfo *TRI, const TargetIntrinsicInfo *IntrinsicInfo) const { @@ -802,9 +782,16 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, OS << '(' << TypeToPrint << ')'; break; } - case MachineOperand::MO_Immediate: - OS << getImm(); + case MachineOperand::MO_Immediate: { + const MIRFormatter *Formatter = nullptr; + if (const MachineFunction *MF = getMFIfAvailable(*this)) + Formatter = MF->getTarget().getMIRFormatter(); + if (Formatter) + Formatter->printImm(OS, *getParent(), OpIdx, getImm()); + else + OS << getImm(); break; + } case MachineOperand::MO_CImmediate: getCImm()->printAsOperand(OS, /*PrintType=*/true, MST); break; @@ -1070,7 +1057,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, SmallVectorImpl &SSNs, const LLVMContext &Context, const MachineFrameInfo *MFI, - const TargetInstrInfo *TII) const { + const TargetInstrInfo *TII, + const MIRFormatter* MIRF) const { OS << '('; if (isVolatile()) OS << "volatile "; @@ -1111,7 +1099,7 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, if (const Value *Val = getValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); - printIRValueReference(OS, *Val, MST); + MIRFormatter::printIRValue(OS, *Val, MST); } else if (const PseudoSourceValue *PVal = getPseudoValue()) { OS << ((isLoad() && isStore()) ? " on " : isLoad() ? " from " : " into "); assert(PVal && "Expected a pseudo source value"); @@ -1144,15 +1132,20 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST, printLLVMNameWithoutPrefix( OS, cast(PVal)->getSymbol()); break; - default: + default: { // FIXME: This is not necessarily the correct MIR serialization format for // a custom pseudo source value, but at least it allows // -print-machineinstrs to work on a target with custom pseudo source // values. 
- OS << "custom "; - PVal->printCustom(OS); + OS << "custom \""; + if (MIRF) + MIRF->printCustomPseudoSourceValue(OS, MST, *PVal); + else + PVal->printCustom(OS); + OS << '\"'; break; } + } } MachineOperand::printOperandOffset(OS, getOffset()); if (getBaseAlignment() != getSize()) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 6fd71393bf38c..cbdcb93e60c90 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -480,7 +480,8 @@ static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, if (MF) MST.incorporateFunction(MF->getFunction()); SmallVector SSNs; - MMO.print(OS, MST, SSNs, Ctx, MFI, TII); + MMO.print(OS, MST, SSNs, Ctx, MFI, TII, + MF ? MF->getTarget().getMIRFormatter() : nullptr); } static void printMemOperand(raw_ostream &OS, const MachineMemOperand &MMO, diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index 97a1eb2f190a9..f070b143d5b4e 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -12,6 +12,7 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/MIRFormatter.h" #include "llvm/IR/Function.h" #include "llvm/IR/GlobalAlias.h" #include "llvm/IR/GlobalValue.h" @@ -37,7 +38,9 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString, : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU), TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false), O0WantsFastISel(false), - DefaultOptions(Options), Options(Options) {} + DefaultOptions(Options), Options(Options) { + MIRF = std::make_unique(); +} TargetMachine::~TargetMachine() = default; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll index dd4f892ebc231..ce71a89adacb7 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f16.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; UNPACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -27,7 +27,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: 
BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -44,7 +44,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; UNPACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; UNPACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; UNPACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_4095__sgpr_soffset_f16 ; PACKED: bb.1 (%ir-block.0): @@ -56,7 +56,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; PACKED: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; PACKED: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f16(half %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -78,7 +78,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16 ; PACKED: bb.1 (%ir-block.0): @@ -91,7 +91,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; 
PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -116,7 +116,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY9]], [[COPY5]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[V_LSHRREV_B32_e64_1]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v4f16 ; PACKED: bb.1 (%ir-block.0): @@ -131,7 +131,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact 
[[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -211,7 +211,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -240,7 +240,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4095 ; PACKED: bb.1 (%ir-block.0): @@ -253,7 +253,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 
4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -275,7 +275,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY6]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_soffset4096 ; PACKED: bb.1 (%ir-block.0): @@ -288,7 +288,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; PACKED: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -310,7 +310,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_16 ; PACKED: bb.1 (%ir-block.0): @@ -323,7 +323,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: 
[[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -346,7 +346,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4095 ; PACKED: bb.1 (%ir-block.0): @@ -359,7 +359,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; PACKED: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -384,7 +384,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; UNPACKED: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 [[COPY7]], [[COPY4]], implicit $exec ; UNPACKED: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[V_LSHRREV_B32_e64_]], %subreg.sub1 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: 
BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %23, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: S_ENDPGM 0 ; PACKED-LABEL: name: raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr_soffset_v2f16_voffset_add_4096 ; PACKED: bb.1 (%ir-block.0): @@ -399,7 +399,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; PACKED: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; PACKED: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -445,7 +445,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; UNPACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; UNPACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; UNPACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; UNPACKED: BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact [[REG_SEQUENCE1]], %48, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; UNPACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; UNPACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; UNPACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -485,7 +485,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; PACKED: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; PACKED: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; PACKED: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, 
align 1, addrspace 4) + ; PACKED: BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %32, [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; PACKED: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; PACKED: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; PACKED: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll index 75d25b0c2c469..aea37fd08b408 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.format.f32.ll @@ -14,7 +14,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -31,7 +31,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__voffset_409 ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_X_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -51,7 +51,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom 
"TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -72,7 +72,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZ_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -94,7 +94,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -132,7 +132,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec 
@@ -159,7 +159,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -179,7 +179,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -199,7 +199,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -220,7 +220,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], 
[[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -243,7 +243,7 @@ define amdgpu_ps void @raw_buffer_store_format__sgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %16:vgpr_32, dead %17:sreg_64 = V_ADD_I32_e64 [[COPY6]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XY_OFFEN_exact [[REG_SEQUENCE1]], %16, [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.format.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -286,7 +286,7 @@ define amdgpu_ps void @raw_buffer_store_format__vgpr_rsrc__vgpr_val__vgpr_voffse ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY11]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact [[REG_SEQUENCE1]], %34, [[REG_SEQUENCE4]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll index 4db5fe081fda2..c5aa36df8675d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.store.ll @@ -15,7 +15,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], 
%subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -36,7 +36,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] ; CHECK: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[COPY5]] - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY7]], [[COPY8]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -71,7 +71,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -103,7 +103,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: successors: %bb.3(0x40000000), %bb.2(0x40000000) ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]], [[COPY6]], implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[V_READFIRSTLANE_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: 
[[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -148,7 +148,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 [[COPY6]], implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]], [[COPY6]], implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE3]], [[V_READFIRSTLANE_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -173,7 +173,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 1) ret void @@ -191,7 +191,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 2) ret void @@ -209,7 +209,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], 
%subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 3) ret void @@ -227,7 +227,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 4) ret void @@ -245,7 +245,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 6) ret void @@ -263,7 +263,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 0, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 5) ret void @@ -281,7 +281,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = 
REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 1, 1, 0, 1, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 7) ret void @@ -301,7 +301,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f32(<2 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -322,7 +322,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY8:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2 - ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX3_OFFEN_exact [[REG_SEQUENCE1]], [[COPY7]], [[REG_SEQUENCE]], [[COPY8]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v3f32(<3 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -344,7 +344,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY9:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1, [[COPY6]], %subreg.sub2, [[COPY7]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX4_OFFEN_exact [[REG_SEQUENCE1]], [[COPY8]], [[REG_SEQUENCE]], [[COPY9]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; 
CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f32(<4 x float> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -362,7 +362,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom TargetCustom7, addrspace 4) + ; CHECK: BUFFER_STORE_BYTE_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 1 into custom "TargetCustom7", addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i8 call void @llvm.amdgcn.raw.buffer.store.i8(i8 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -381,7 +381,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %val.trunc = trunc i32 %val to i16 call void @llvm.amdgcn.raw.buffer.store.i16(i16 %val.trunc, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -400,7 +400,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_SHORT_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 2 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f16(half %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -418,7 +418,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -438,7 +438,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY7:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY4]], %subreg.sub0, [[COPY5]], %subreg.sub1 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v4f16(<4 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) ret void @@ -474,7 +474,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE3]], [[COPY9]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE4:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORDX2_OFFEN_exact [[REG_SEQUENCE1]], [[COPY6]], [[REG_SEQUENCE4]], [[COPY7]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 8 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -498,7 +498,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], $noreg, [[REG_SEQUENCE]], [[COPY5]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4095, i32 %soffset, i32 0) ret void @@ -516,7 +516,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__sgpr_soffset_f32_v ; CHECK: [[COPY5:%[0-9]+]]:sreg_32 = COPY 
$sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE]], [[COPY5]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 4096, i32 %soffset, i32 0) ret void @@ -534,7 +534,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -553,7 +553,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -574,7 +574,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, 
align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -593,7 +593,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4095 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4095, i32 0) ret void @@ -611,7 +611,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 4096 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 4096, i32 0) ret void @@ -629,7 +629,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 16, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 16, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 16 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -648,7 +648,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sreg_32 = COPY $sgpr6 ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4095, align 1, addrspace 4) + 
; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[COPY5]], [[REG_SEQUENCE]], [[COPY6]], 4095, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4095, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4095 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -669,7 +669,7 @@ define amdgpu_ps void @raw_buffer_store__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3 ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4096, implicit $exec ; CHECK: %14:vgpr_32, dead %15:sreg_64 = V_ADD_I32_e64 [[COPY5]], killed [[V_MOV_B32_e32_]], 0, implicit $exec - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 4096, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %14, [[REG_SEQUENCE]], [[COPY6]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 4096, align 1, addrspace 4) ; CHECK: S_ENDPGM 0 %voffset.add = add i32 %voffset, 4096 call void @llvm.amdgcn.raw.buffer.store.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset.add, i32 %soffset, i32 0) @@ -707,7 +707,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY8]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], %30, [[REG_SEQUENCE3]], [[COPY6]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -750,7 +750,7 @@ define amdgpu_ps void @raw_buffer_store__vgpr_rsrc__vgpr_val__5000_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[REG_SEQUENCE2]], [[COPY7]], implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[V_READFIRSTLANE_B32_]], %subreg.sub0, [[V_READFIRSTLANE_B32_1]], %subreg.sub1, [[V_READFIRSTLANE_B32_2]], %subreg.sub2, [[V_READFIRSTLANE_B32_3]], %subreg.sub3 - ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], [[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom TargetCustom7 + 5000, align 1, addrspace 4) + ; CHECK: BUFFER_STORE_DWORD_OFFEN_exact [[COPY4]], [[V_MOV_B32_e32_]], 
[[REG_SEQUENCE3]], [[COPY5]], 904, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 4 into custom "TargetCustom7" + 5000, align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll index 44a17012237ec..d4a3f4025b378 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.load.1d.ll @@ -18,7 +18,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY9:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY9]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -44,7 +44,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__sgpr_srsrc(<8 x i32> inreg %rsrc, i32 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY8]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY10:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY10]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -98,7 +98,7 @@ define amdgpu_ps void @load_1d_vgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 %s) { ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 
x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY8]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -160,7 +160,7 @@ define amdgpu_ps void @load_1d_sgpr_vaddr__vgpr_srsrc(<8 x i32> %rsrc, i32 inreg ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.load.1d), 15, [[COPY9]](s32), [[BUILD_VECTOR1]](<8 x s32>), 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll index c59372a8d09c7..e5d67a3f88742 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.image.sample.1d.ll @@ -23,7 +23,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32) ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY13:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY13]](p1) :: 
(store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -54,7 +54,7 @@ define amdgpu_ps void @sample_1d_sgpr_vaddr__sgpr_rsrc__sgpr_samp(<8 x i32> inre ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY8]](s32), [[COPY9]](s32), [[COPY10]](s32), [[COPY11]](s32) ; CHECK: [[DEF:%[0-9]+]]:sgpr(p1) = G_IMPLICIT_DEF ; CHECK: [[COPY13:%[0-9]+]]:vgpr(s32) = COPY [[COPY12]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY13]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[COPY14:%[0-9]+]]:vgpr(p1) = COPY [[DEF]](p1) ; CHECK: G_STORE [[INT]](<4 x s32>), [[COPY14]](p1) :: (store 16 into `<4 x float> addrspace(1)* undef`, addrspace 1) ; CHECK: S_ENDPGM 0 @@ -113,7 +113,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__sgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_3:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV3]](s64), [[UV3]](s64), implicit $exec ; CHECK: [[S_AND_B64_2:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_3]], [[S_AND_B64_1]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<8 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32), [[V_READFIRSTLANE_B32_4]](s32), [[V_READFIRSTLANE_B32_5]](s32), [[V_READFIRSTLANE_B32_6]](s32), [[V_READFIRSTLANE_B32_7]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR1]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_2]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -169,7 +169,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__sgpr_rsrc__vgpr_samp(<8 x i32> inre ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR2:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), [[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR]](<8 x s32>), 
[[BUILD_VECTOR2]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -247,7 +247,7 @@ define amdgpu_ps void @sample_1d_vgpr_vaddr__vgpr_rsrc__vgpr_samp(<8 x i32> %rsr ; CHECK: [[V_CMP_EQ_U64_e64_5:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV5]](s64), [[UV5]](s64), implicit $exec ; CHECK: [[S_AND_B64_4:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_5]], [[S_AND_B64_3]], implicit-def $scc ; CHECK: [[BUILD_VECTOR3:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_8]](s32), [[V_READFIRSTLANE_B32_9]](s32), [[V_READFIRSTLANE_B32_10]](s32), [[V_READFIRSTLANE_B32_11]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom TargetCustom8) + ; CHECK: [[INT:%[0-9]+]]:vgpr(<4 x s32>) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.image.sample.1d), 15, [[COPY12]](s32), [[BUILD_VECTOR2]](<8 x s32>), [[BUILD_VECTOR3]](<4 x s32>), 0, 0, 0 :: (dereferenceable load 16 from custom "TargetCustom8") ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_4]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll index 33a8e9a1284cc..4443daba2ee2d 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.raw.buffer.load.ll @@ -14,7 +14,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[COPY4:%[0-9]+]]:vgpr(s32) = COPY $vgpr0 ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -34,7 +34,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__sgpr_val__sgpr_voffset__sgpr ; CHECK: [[COPY5:%[0-9]+]]:sgpr(s32) = COPY $sgpr7 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), 
[[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY6]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.raw.buffer.load.f32(<4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0) @@ -72,7 +72,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__sgpr ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -108,7 +108,7 @@ define amdgpu_ps float @raw_buffer_load__sgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %9(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY5]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -156,7 +156,7 @@ define amdgpu_ps float @raw_buffer_load__vgpr_rsrc__vgpr_val__vgpr_voffset__vgpr ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY5]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY5]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], 
[[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.raw.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll index 9bc81aecc8a1d..a657488278b04 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.load.ll @@ -15,7 +15,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY5:%[0-9]+]]:vgpr(s32) = COPY $vgpr1 ; CHECK: [[COPY6:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -37,7 +37,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) ; CHECK: [[COPY7:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY7]](s32), [[COPY8]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: $vgpr0 = COPY [[INT]](s32) ; CHECK: SI_RETURN_TO_EPILOG implicit $vgpr0 %val = call float @llvm.amdgcn.struct.buffer.load.f32(<4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) @@ -76,7 +76,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 
[[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[COPY6]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -113,7 +113,7 @@ define amdgpu_ps float @struct_buffer_load__sgpr_rsrc__vgpr_val__vgpr_vindex_vgp ; CHECK: [[PHI1:%[0-9]+]]:vgpr(s32) = G_PHI [[DEF]](s32), %bb.1, %10(s32), %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY6]](s32), implicit $exec - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -162,7 +162,7 @@ define amdgpu_ps float @struct_buffer_load__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY6]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY6]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom TargetCustom7, align 1, addrspace 4) + ; CHECK: [[INT:%[0-9]+]]:vgpr(s32) = G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.load), [[BUILD_VECTOR1]](<4 x s32>), [[COPY4]](s32), [[COPY5]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable load 4 from custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = 
S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll index efe81eabc3497..a6ba559382f5c 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankselect-amdgcn.struct.buffer.store.ll @@ -16,7 +16,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[COPY6:%[0-9]+]]:vgpr(s32) = COPY $vgpr2 ; CHECK: [[COPY7:%[0-9]+]]:sgpr(s32) = COPY $sgpr6 ; CHECK: [[BUILD_VECTOR:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[COPY]](s32), [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -39,7 +39,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__sgpr_val__sgpr_vindex__sg ; CHECK: [[COPY8:%[0-9]+]]:vgpr(s32) = COPY [[COPY4]](s32) ; CHECK: [[COPY9:%[0-9]+]]:vgpr(s32) = COPY [[COPY5]](s32) ; CHECK: [[COPY10:%[0-9]+]]:vgpr(s32) = COPY [[COPY6]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY8]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY9]](s32), [[COPY10]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: S_ENDPGM 0 call void @llvm.amdgcn.struct.buffer.store.f32(float %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0) ret void @@ -76,7 +76,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_CMP_EQ_U64_e64_1:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U64_e64 [[MV1]](s64), [[UV1]](s64), implicit $exec ; CHECK: [[S_AND_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U64_e64_1]], [[V_CMP_EQ_U64_e64_]], implicit-def $scc ; CHECK: [[BUILD_VECTOR1:%[0-9]+]]:sgpr(<4 x s32>) = G_BUILD_VECTOR [[V_READFIRSTLANE_B32_]](s32), [[V_READFIRSTLANE_B32_1]](s32), [[V_READFIRSTLANE_B32_2]](s32), [[V_READFIRSTLANE_B32_3]](s32) - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[COPY7]](s32), 0 :: (dereferenceable store 4 into 
custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -111,7 +111,7 @@ define amdgpu_ps void @struct_buffer_store__sgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[PHI:%[0-9]+]]:sreg_64_xexec = PHI [[DEF]], %bb.1, %14, %bb.2 ; CHECK: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_]](s32), [[COPY7]](s32), implicit $exec - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[V_CMP_EQ_U32_e64_]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec @@ -158,7 +158,7 @@ define amdgpu_ps void @struct_buffer_store__vgpr_rsrc__vgpr_val__vgpr_vindex__vg ; CHECK: [[V_READFIRSTLANE_B32_4:%[0-9]+]]:sreg_32_xm0(s32) = V_READFIRSTLANE_B32 [[COPY7]](s32), implicit $exec ; CHECK: [[V_CMP_EQ_U32_e64_:%[0-9]+]]:sreg_64_xexec = V_CMP_EQ_U32_e64 [[V_READFIRSTLANE_B32_4]](s32), [[COPY7]](s32), implicit $exec ; CHECK: [[S_AND_B64_1:%[0-9]+]]:sreg_64_xexec = S_AND_B64 [[V_CMP_EQ_U32_e64_]], [[S_AND_B64_]], implicit-def $scc - ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom TargetCustom7, align 1, addrspace 4) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.struct.buffer.store), [[COPY4]](s32), [[BUILD_VECTOR1]](<4 x s32>), [[COPY5]](s32), [[COPY6]](s32), [[V_READFIRSTLANE_B32_4]](s32), 0 :: (dereferenceable store 4 into custom "TargetCustom7", align 1, addrspace 4) ; CHECK: [[S_AND_SAVEEXEC_B64_:%[0-9]+]]:sreg_64_xexec = S_AND_SAVEEXEC_B64 killed [[S_AND_B64_1]], implicit-def $exec, implicit-def $scc, implicit $exec ; CHECK: $exec = S_XOR_B64_term $exec, [[S_AND_SAVEEXEC_B64_]], implicit-def $scc ; CHECK: S_CBRANCH_EXECNZ %bb.2, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll index f96a13878ba6a..ce62e041aa67e 100644 --- a/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll +++ b/llvm/test/CodeGen/AMDGPU/buffer-intrinsics-mmo-offsets.ll @@ -10,218 +10,218 @@ define amdgpu_cs void @mmo_offsets0(<4 x i32> addrspace(6)* inreg noalias derefe ; GCN: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 0 ; GCN: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[S_MOV_B32_]], %subreg.sub1 ; GCN: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM killed [[REG_SEQUENCE]], 0, 0, 0 :: 
(dereferenceable invariant load 16 from %ir.arg0, addrspace 6) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 16, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 16, align 1, addrspace 4) ; GCN: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1, implicit $exec - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 16, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 32, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 32, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 
into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 32, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 48, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 48, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 48, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 64, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 
0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 64, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 80, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 80, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY]], %subreg.sub1 ; GCN: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 96, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 96, align 1, addrspace 4) ; GCN: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[DEF]].sub0 ; GCN: [[DEF1:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[DEF1]].sub0 ; GCN: [[DEF2:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[DEF2]].sub0 ; GCN: [[DEF3:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 96, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF3]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7 + 112, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom TargetCustom7, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFSET [[V_MOV_B32_e32_1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7" + 112, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_OFFEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 0, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_F32_IDXEN [[V_MOV_B32_e32_1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 112, 0, implicit $exec :: (load store 4 on custom "TargetCustom7", addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 128, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 64 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_1]], 64, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) ; GCN: [[S_MOV_B32_2:%[0-9]+]]:sreg_32 = S_MOV_B32 128 - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 128, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 128, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY6]], 128, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_3:%[0-9]+]]:sreg_32 = S_MOV_B32 72 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_3]], 72, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) ; GCN: [[S_MOV_B32_4:%[0-9]+]]:sreg_32 = S_MOV_B32 144 - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable load 16 from custom TargetCustom7 + 144, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 144, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY7:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_OFFSET [[S_LOAD_DWORDX4_IMM]], [[COPY7]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_5:%[0-9]+]]:sreg_32 = S_MOV_B32 80 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_5]], 80, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) ; GCN: [[S_MOV_B32_6:%[0-9]+]]:sreg_32 = S_MOV_B32 160 - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 160, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 160, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_6]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY8:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_OFFSET [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[COPY8]], 160, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom 
"TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[DEF4:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[DEF4]].sub0 ; GCN: [[S_MOV_B32_7:%[0-9]+]]:sreg_32 = S_MOV_B32 88 ; GCN: [[DEF5:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_7]], 88, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[DEF5]].sub0 ; GCN: [[S_MOV_B32_8:%[0-9]+]]:sreg_32 = S_MOV_B32 176 ; GCN: [[DEF6:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 176, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 176, align 1, addrspace 4) ; GCN: [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[DEF6]].sub0 ; GCN: [[DEF7:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_8]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[DEF7]].sub0 ; GCN: [[COPY13:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF8:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_OFFSET [[REG_SEQUENCE1]], [[S_LOAD_DWORDX4_IMM]], [[COPY13]], 176, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[DEF8]].sub0 ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_9:%[0-9]+]]:sreg_32 = S_MOV_B32 96 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], 
[[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_9]], 96, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) ; GCN: [[S_MOV_B32_10:%[0-9]+]]:sreg_32 = S_MOV_B32 192 - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 192, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 192, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFEN_exact killed [[BUFFER_LOAD_DWORDX4_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_10]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY15:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[BUFFER_LOAD_DWORDX4_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY15]], 192, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET1]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_11:%[0-9]+]]:sreg_32 = S_MOV_B32 104 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET2]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_11]], 104, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) ; GCN: [[S_MOV_B32_12:%[0-9]+]]:sreg_32 = S_MOV_B32 208 - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 208, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: 
(dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET3]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 208, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFEN1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_12]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY16:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_OFFSET_exact killed [[BUFFER_LOAD_FORMAT_XYZW_OFFSET4]], [[S_LOAD_DWORDX4_IMM]], [[COPY16]], 208, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY17]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_13:%[0-9]+]]:sreg_32 = S_MOV_B32 112 ; GCN: [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY18]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_13]], 112, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[S_MOV_B32_14:%[0-9]+]]:sreg_32 = S_MOV_B32 224 ; GCN: [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 224, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY19]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 224, align 1, addrspace 4) ; GCN: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[S_MOV_B32_]], %subreg.sub0, [[COPY]], %subreg.sub1 - ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_14]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from 
custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY21:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY20]], [[S_LOAD_DWORDX4_IMM]], [[COPY21]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 224, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY22]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_15:%[0-9]+]]:sreg_32 = S_MOV_B32 120 ; GCN: [[COPY23:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY23]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_15]], 120, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) ; GCN: [[S_MOV_B32_16:%[0-9]+]]:sreg_32 = S_MOV_B32 240 ; GCN: [[COPY24:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7 + 240, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, 
addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY24]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7" + 240, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_BOTHEN [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_16]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY25:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY26:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) - ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY25]], [[S_LOAD_DWORDX4_IMM]], [[COPY26]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7:%[0-9]+]]:vreg_128 = BUFFER_LOAD_FORMAT_XYZW_IDXEN [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 240, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY27:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY27]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[COPY28:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY28]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_2]], 128, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) ; GCN: [[S_MOV_B32_17:%[0-9]+]]:sreg_32 = S_MOV_B32 256 ; GCN: [[COPY29:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 256, align 1, addrspace 4) - ; GCN: 
BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY29]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 256, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_BOTHEN [[COPY]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_17]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY30:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY31:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY30]], [[S_LOAD_DWORDX4_IMM]], [[COPY31]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_ADD_IDXEN [[COPY]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 256, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY32:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF9:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY32]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY33:%[0-9]+]]:vgpr_32 = COPY [[DEF9]].sub0 ; GCN: [[S_MOV_B32_18:%[0-9]+]]:sreg_32 = S_MOV_B32 136 ; GCN: [[COPY34:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF10:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY34]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_18]], 136, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY35:%[0-9]+]]:vgpr_32 = COPY [[DEF10]].sub0 ; GCN: [[S_MOV_B32_19:%[0-9]+]]:sreg_32 = S_MOV_B32 272 ; GCN: [[COPY36:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[DEF11:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], 
[[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7 + 272, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY36]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7" + 272, align 1, addrspace 4) ; GCN: [[COPY37:%[0-9]+]]:vgpr_32 = COPY [[DEF11]].sub0 ; GCN: [[DEF12:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_BOTHEN [[REG_SEQUENCE1]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_19]], 0, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY38:%[0-9]+]]:vgpr_32 = COPY [[DEF12]].sub0 ; GCN: [[COPY39:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY40:%[0-9]+]]:sreg_32 = COPY [[COPY]] ; GCN: [[DEF13:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY39]], [[S_LOAD_DWORDX4_IMM]], [[COPY40]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY41:%[0-9]+]]:vgpr_32 = COPY [[DEF13]].sub0 ; GCN: [[DEF14:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY42:%[0-9]+]]:vgpr_32 = COPY [[DEF14]].sub0 ; GCN: [[DEF15:%[0-9]+]]:vreg_64 = IMPLICIT_DEF - ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_ATOMIC_CMPSWAP_IDXEN [[REG_SEQUENCE1]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 272, 0, implicit $exec :: (volatile dereferenceable load store 4 on custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY43:%[0-9]+]]:vgpr_32 = COPY [[DEF15]].sub0 ; GCN: INLINEASM &"", 1 ; GCN: [[COPY44:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN2]], [[COPY44]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[COPY45:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, 
implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN3]], [[COPY45]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_4]], 144, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) ; GCN: [[S_MOV_B32_20:%[0-9]+]]:sreg_32 = S_MOV_B32 288 ; GCN: [[COPY46:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 288, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN4]], [[COPY46]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 288, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_BOTHEN_exact killed [[BUFFER_LOAD_DWORDX4_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_20]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY47:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY48:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN5]], [[COPY47]], [[S_LOAD_DWORDX4_IMM]], [[COPY48]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX4_IDXEN_exact killed [[BUFFER_LOAD_DWORDX4_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 288, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: INLINEASM &"", 1 ; GCN: [[COPY49:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN2]], [[COPY49]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 
304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_21:%[0-9]+]]:sreg_32 = S_MOV_B32 152 ; GCN: [[COPY50:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN3]], [[COPY50]], [[S_LOAD_DWORDX4_IMM]], killed [[S_MOV_B32_21]], 152, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) ; GCN: [[S_MOV_B32_22:%[0-9]+]]:sreg_32 = S_MOV_B32 304 ; GCN: [[COPY51:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7 + 304, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN4]], [[COPY51]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7" + 304, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_BOTHEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_BOTHEN]], [[REG_SEQUENCE2]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_22]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY52:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]] ; GCN: [[COPY53:%[0-9]+]]:sreg_32 = COPY [[COPY]] - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) - ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN5]], [[COPY52]], [[S_LOAD_DWORDX4_IMM]], [[COPY53]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN6]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", align 1, addrspace 4) + ; GCN: BUFFER_STORE_FORMAT_XYZW_IDXEN_exact killed [[BUFFER_LOAD_FORMAT_XYZW_IDXEN7]], [[COPY]], [[S_LOAD_DWORDX4_IMM]], [[S_MOV_B32_]], 304, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 16 into custom "TargetCustom7", 
align 1, addrspace 4) ; GCN: S_ENDPGM 0 bb.0: %tmp0 = load <4 x i32>, <4 x i32> addrspace(6)* %arg0, align 16, !invariant.load !0 diff --git a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll index 0c264251942a4..93322c7da4f86 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_subvector_vec4_vec3.ll @@ -12,7 +12,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[DEF]] ; GCN: [[DEF1:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom TargetCustom7, align 1, addrspace 4) + ; GCN: [[BUFFER_LOAD_DWORDX4_OFFEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFEN [[COPY]], [[DEF1]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "TargetCustom7", align 1, addrspace 4) ; GCN: [[COPY1:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub2 ; GCN: [[COPY2:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub1 ; GCN: [[COPY3:%[0-9]+]]:sgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFEN]].sub0 @@ -21,7 +21,7 @@ define amdgpu_hs void @main([0 x i8] addrspace(6)* inreg %arg) { ; GCN: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[DEF2]] ; GCN: [[DEF3:%[0-9]+]]:sgpr_128 = IMPLICIT_DEF - ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom TargetCustom7, align 1, addrspace 4) + ; GCN: BUFFER_STORE_DWORDX3_OFFEN_exact killed [[COPY4]], [[COPY5]], [[DEF3]], [[S_MOV_B32_]], 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable store 12 into custom "TargetCustom7", align 1, addrspace 4) ; GCN: S_ENDPGM 0 main_body: %tmp25 = call <4 x float> @llvm.amdgcn.raw.buffer.load.v4f32(<4 x i32> undef, i32 undef, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll index cdcf7383afc09..068e3d98f17be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.barrier.ll @@ -27,7 +27,7 @@ ; MIR-LABEL: name: gws_barrier_offset0{{$}} ; MIR: BUNDLE implicit{{( killed)?( renamable)?}} $vgpr0, implicit $m0, implicit $exec { -; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom GWSResource) +; MIR-NEXT: DS_GWS_BARRIER renamable $vgpr0, 0, -1, implicit $m0, implicit $exec :: (load 4 from custom "GWSResource") ; MIR-NEXT: S_WAITCNT 0 ; MIR-NEXT: } define amdgpu_kernel void @gws_barrier_offset0(i32 %val) #0 { diff --git a/llvm/unittests/CodeGen/MachineOperandTest.cpp b/llvm/unittests/CodeGen/MachineOperandTest.cpp index faa471f2260c7..7e60fab281545 100644 --- a/llvm/unittests/CodeGen/MachineOperandTest.cpp +++ b/llvm/unittests/CodeGen/MachineOperandTest.cpp @@ -310,7 +310,7 @@ TEST(MachineOperandTest, PrintMetadata) { std::string str; // Print a MachineOperand containing a metadata node. 
raw_string_ostream OS(str); - MO.print(OS, MST, LLT{}, /*PrintDef=*/false, /*IsStandalone=*/false, + MO.print(OS, MST, LLT{}, /*OpIdx*/~0U, /*PrintDef=*/false, /*IsStandalone=*/false, /*ShouldPrintRegisterTies=*/false, 0, /*TRI=*/nullptr, /*IntrinsicInfo=*/nullptr); ASSERT_TRUE(OS.str() == "!0"); From d48ac7d54d8a096677c84cfb2928400e05b918ea Mon Sep 17 00:00:00 2001 From: QingShan Zhang Date: Thu, 9 Jan 2020 03:41:36 +0000 Subject: [PATCH 23/89] [DAGCombine] Fold the (fma -x, y, -z) to -(fma x, y, z) This is a positive combination as long as the NEG is NOT free, as we are reducing the number of NEG from two to one. Differential Revision: https://reviews.llvm.org/D72312 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 9 ++++ llvm/test/CodeGen/PowerPC/combine-fneg.ll | 11 ++--- llvm/test/CodeGen/PowerPC/fma-combine.ll | 42 ++++++++----------- llvm/test/CodeGen/PowerPC/qpx-recipest.ll | 4 +- llvm/test/CodeGen/PowerPC/recipest.ll | 4 +- .../CodeGen/PowerPC/repeated-fp-divisors.ll | 4 +- 6 files changed, 37 insertions(+), 37 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index cfc4671eaa0e4..37b1b17218d02 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -12639,6 +12639,15 @@ SDValue DAGCombiner::visitFMA(SDNode *N) { } } + // fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z)) + // fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z)) + if (!TLI.isFNegFree(VT) && + TLI.isNegatibleForFree(SDValue(N, 0), DAG, LegalOperations, + ForCodeSize) == 2) + return DAG.getNode(ISD::FNEG, DL, VT, + TLI.getNegatedExpression(SDValue(N, 0), DAG, + LegalOperations, ForCodeSize), + Flags); return SDValue(); } diff --git a/llvm/test/CodeGen/PowerPC/combine-fneg.ll b/llvm/test/CodeGen/PowerPC/combine-fneg.ll index 1d85f4f9680ae..14bace2f95f8b 100644 --- a/llvm/test/CodeGen/PowerPC/combine-fneg.ll +++ b/llvm/test/CodeGen/PowerPC/combine-fneg.ll @@ -5,19 +5,16 @@ define <4 x double> @fneg_fdiv_splat(double %a0, <4 x double> %a1) { ; CHECK-LABEL: fneg_fdiv_splat: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: # kill: def $f1 killed $f1 def $vsl1 ; CHECK-NEXT: xxspltd 0, 1, 0 -; CHECK-NEXT: addis 3, 2, .LCPI0_0@toc@ha ; CHECK-NEXT: addi 3, 3, .LCPI0_0@toc@l ; CHECK-NEXT: lxvd2x 1, 0, 3 -; CHECK-NEXT: addis 3, 2, .LCPI0_1@toc@ha ; CHECK-NEXT: xvredp 2, 0 -; CHECK-NEXT: addi 3, 3, .LCPI0_1@toc@l -; CHECK-NEXT: xxswapd 1, 1 -; CHECK-NEXT: xvnmsubadp 1, 0, 2 -; CHECK-NEXT: xvmaddadp 2, 2, 1 -; CHECK-NEXT: lxvd2x 1, 0, 3 ; CHECK-NEXT: xxswapd 1, 1 +; CHECK-NEXT: xxlor 3, 1, 1 +; CHECK-NEXT: xvmaddadp 3, 0, 2 +; CHECK-NEXT: xvnmsubadp 2, 2, 3 ; CHECK-NEXT: xvmaddadp 1, 0, 2 ; CHECK-NEXT: xvmsubadp 2, 2, 1 ; CHECK-NEXT: xvmuldp 34, 34, 2 diff --git a/llvm/test/CodeGen/PowerPC/fma-combine.ll b/llvm/test/CodeGen/PowerPC/fma-combine.ll index 5baf663481d70..88da295201fea 100644 --- a/llvm/test/CodeGen/PowerPC/fma-combine.ll +++ b/llvm/test/CodeGen/PowerPC/fma-combine.ll @@ -8,14 +8,12 @@ define double @fma_combine1(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine1: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine1: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: fmsub 1, 0, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 1, 3, 2, 1 ; 
CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine1: @@ -34,14 +32,12 @@ entry: define double @fma_combine2(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine2: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 -; CHECK-FAST-NEXT: xsmsubadp 1, 2, 0 +; CHECK-FAST-NEXT: xsnmaddadp 1, 2, 3 ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine2: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: fmsub 1, 2, 0, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 1, 2, 3, 1 ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine2: @@ -62,25 +58,25 @@ entry: define double @fma_combine_two_uses(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine_two_uses: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 +; CHECK-FAST-NEXT: xsnegdp 0, 1 ; CHECK-FAST-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NEXT: addis 4, 2, z@toc@ha -; CHECK-FAST-NEXT: xsnegdp 3, 1 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 -; CHECK-FAST-NEXT: stfd 0, z@toc@l(4) -; CHECK-FAST-NEXT: stfd 3, v@toc@l(3) +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 +; CHECK-FAST-NEXT: xsnegdp 2, 3 +; CHECK-FAST-NEXT: stfd 0, v@toc@l(3) +; CHECK-FAST-NEXT: stfd 2, z@toc@l(4) ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine_two_uses: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 3, 3 +; CHECK-FAST-NOVSX-NEXT: fnmadd 0, 3, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 ; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NOVSX-NEXT: addis 4, 2, z@toc@ha -; CHECK-FAST-NOVSX-NEXT: fmsub 0, 3, 2, 1 -; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 -; CHECK-FAST-NOVSX-NEXT: stfd 3, z@toc@l(4) +; CHECK-FAST-NOVSX-NEXT: fneg 3, 3 ; CHECK-FAST-NOVSX-NEXT: fmr 1, 0 ; CHECK-FAST-NOVSX-NEXT: stfd 2, v@toc@l(3) +; CHECK-FAST-NOVSX-NEXT: stfd 3, z@toc@l(4) ; CHECK-FAST-NOVSX-NEXT: blr ; ; CHECK-LABEL: fma_combine_two_uses: @@ -108,19 +104,17 @@ entry: define double @fma_combine_one_use(double %a, double %b, double %c) { ; CHECK-FAST-LABEL: fma_combine_one_use: ; CHECK-FAST: # %bb.0: # %entry -; CHECK-FAST-NEXT: xsnegdp 0, 3 +; CHECK-FAST-NEXT: xsnegdp 0, 1 ; CHECK-FAST-NEXT: addis 3, 2, v@toc@ha -; CHECK-FAST-NEXT: xsnegdp 3, 1 -; CHECK-FAST-NEXT: xsmsubadp 1, 0, 2 -; CHECK-FAST-NEXT: stfd 3, v@toc@l(3) +; CHECK-FAST-NEXT: xsnmaddadp 1, 3, 2 +; CHECK-FAST-NEXT: stfd 0, v@toc@l(3) ; CHECK-FAST-NEXT: blr ; ; CHECK-FAST-NOVSX-LABEL: fma_combine_one_use: ; CHECK-FAST-NOVSX: # %bb.0: # %entry -; CHECK-FAST-NOVSX-NEXT: fneg 0, 3 -; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha -; CHECK-FAST-NOVSX-NEXT: fmsub 0, 0, 2, 1 +; CHECK-FAST-NOVSX-NEXT: fnmadd 0, 3, 2, 1 ; CHECK-FAST-NOVSX-NEXT: fneg 2, 1 +; CHECK-FAST-NOVSX-NEXT: addis 3, 2, v@toc@ha ; CHECK-FAST-NOVSX-NEXT: fmr 1, 0 ; CHECK-FAST-NOVSX-NEXT: stfd 2, v@toc@l(3) ; CHECK-FAST-NOVSX-NEXT: blr diff --git a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll index 3bfd92a2e5b36..246bec1918ef2 100644 --- a/llvm/test/CodeGen/PowerPC/qpx-recipest.ll +++ b/llvm/test/CodeGen/PowerPC/qpx-recipest.ll @@ -229,8 +229,8 @@ define <4 x double> @foo2_fmf(<4 x double> %a, <4 x double> %b) nounwind { ; CHECK-NEXT: qvfre 3, 2 ; CHECK-NEXT: addi 3, 3, .LCPI8_0@toc@l ; CHECK-NEXT: qvlfdx 0, 0, 3 -; CHECK-NEXT: qvfnmsub 0, 2, 3, 0 -; CHECK-NEXT: qvfmadd 0, 3, 0, 3 +; CHECK-NEXT: qvfmadd 0, 2, 3, 0 +; CHECK-NEXT: qvfnmsub 0, 3, 0, 3 ; CHECK-NEXT: qvfmul 3, 1, 0 ; CHECK-NEXT: qvfnmsub 1, 2, 3, 1 ; CHECK-NEXT: qvfmadd 1, 0, 1, 3 diff --git a/llvm/test/CodeGen/PowerPC/recipest.ll 
b/llvm/test/CodeGen/PowerPC/recipest.ll index bc33617662e47..a0afb4b6e12dc 100644 --- a/llvm/test/CodeGen/PowerPC/recipest.ll +++ b/llvm/test/CodeGen/PowerPC/recipest.ll @@ -194,8 +194,8 @@ define <4 x float> @hoo_safe(<4 x float> %a, <4 x float> %b) nounwind { define double @foo2_fmf(double %a, double %b) nounwind { ; CHECK: @foo2_fmf ; CHECK-DAG: fre -; CHECK-DAG: fnmsub -; CHECK: fmadd +; CHECK-DAG: fmadd +; CHECK: fnmsub ; CHECK-NEXT: fmul ; CHECK-NEXT: fnmsub ; CHECK-NEXT: fmadd diff --git a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll index 9ab320cd1eacf..2cdf832838a8d 100644 --- a/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll +++ b/llvm/test/CodeGen/PowerPC/repeated-fp-divisors.ll @@ -13,9 +13,9 @@ define <4 x float> @repeated_fp_divisor(float %a, <4 x float> %b) { ; CHECK-NEXT: lvx 4, 0, 3 ; CHECK-NEXT: xxspltw 0, 0, 0 ; CHECK-NEXT: xvresp 1, 0 -; CHECK-NEXT: xvnmsubasp 35, 0, 1 +; CHECK-NEXT: xvmaddasp 35, 0, 1 ; CHECK-NEXT: xvmulsp 0, 34, 36 -; CHECK-NEXT: xvmaddasp 1, 1, 35 +; CHECK-NEXT: xvnmsubasp 1, 1, 35 ; CHECK-NEXT: xvmulsp 34, 0, 1 ; CHECK-NEXT: blr %ins = insertelement <4 x float> undef, float %a, i32 0 From 51bdd98b8a52d07004bcaddff26caf376a1c32bf Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 8 Jan 2020 20:56:11 -0800 Subject: [PATCH 24/89] [lldb/SWIG] Refactor extensions to be non Python-specific (3/3) The current SWIG extensions for the string conversion operator is Python specific because it uses the PythonObjects. This means that the code cannot be reused for other SWIG supported languages such as Lua. This reimplements the extensions in a more generic way that can be reused. It uses a SWIG macro to reduce code duplication. Differential revision: https://reviews.llvm.org/D72377 --- lldb/scripts/Python/python-extensions.swig | 257 +------------------ lldb/scripts/interface/SBProcess.i | 2 + lldb/scripts/interface/SBSection.i | 2 + lldb/scripts/interface/SBSymbol.i | 2 + lldb/scripts/interface/SBSymbolContext.i | 1 + lldb/scripts/interface/SBSymbolContextList.i | 2 + lldb/scripts/interface/SBThread.i | 2 + lldb/scripts/interface/SBType.i | 5 + lldb/scripts/interface/SBTypeCategory.i | 2 + lldb/scripts/interface/SBTypeEnumMember.i | 1 + lldb/scripts/interface/SBTypeFilter.i | 2 + lldb/scripts/interface/SBTypeFormat.i | 2 + lldb/scripts/interface/SBTypeNameSpecifier.i | 2 + lldb/scripts/interface/SBTypeSummary.i | 2 + lldb/scripts/interface/SBTypeSynthetic.i | 2 + lldb/scripts/interface/SBValue.i | 2 + lldb/scripts/interface/SBValueList.i | 23 ++ lldb/scripts/interface/SBWatchpoint.i | 1 + 18 files changed, 59 insertions(+), 253 deletions(-) diff --git a/lldb/scripts/Python/python-extensions.swig b/lldb/scripts/Python/python-extensions.swig index 36dac19644bd0..0b23fdd400068 100644 --- a/lldb/scripts/Python/python-extensions.swig +++ b/lldb/scripts/Python/python-extensions.swig @@ -125,32 +125,7 @@ %} } -%extend lldb::SBProcess { - %nothreadallow; - PyObject *lldb::SBProcess::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} %extend lldb::SBSection { - %nothreadallow; - PyObject *lldb::SBSection::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = 
description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; - %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -180,17 +155,6 @@ {} } %extend lldb::SBSymbol { - %nothreadallow; - PyObject *lldb::SBSymbol::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -205,32 +169,6 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } -%extend lldb::SBSymbolContext { - %nothreadallow; - PyObject *lldb::SBSymbolContext::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBSymbolContextList { - %nothreadallow; - PyObject *lldb::SBSymbolContextList::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} %extend lldb::SBTarget { %pythoncode %{ @@ -248,44 +186,7 @@ %} } -%extend lldb::SBType { - %nothreadallow; - PyObject *lldb::SBType::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeCategory { - %nothreadallow; - PyObject *lldb::SBTypeCategory::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} %extend lldb::SBTypeFilter { - %nothreadallow; - PyObject *lldb::SBTypeFilter::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -300,70 +201,8 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } -%extend lldb::SBTypeFormat { - %nothreadallow; - PyObject *lldb::SBTypeFormat::__str__ (){ - 
lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeMember { - %nothreadallow; - PyObject *lldb::SBTypeMember::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeMemberFunction { - %nothreadallow; - PyObject *lldb::SBTypeMemberFunction::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBTypeEnumMember { - %nothreadallow; - PyObject *lldb::SBTypeEnumMember::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} + %extend lldb::SBTypeNameSpecifier { - %nothreadallow; - PyObject *lldb::SBTypeNameSpecifier::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -378,18 +217,8 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } + %extend lldb::SBTypeSummary { - %nothreadallow; - PyObject *lldb::SBTypeSummary::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -404,18 +233,8 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } + %extend lldb::SBTypeSynthetic { - %nothreadallow; - PyObject *lldb::SBTypeSynthetic::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelBrief); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode 
%{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -430,18 +249,8 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } + %extend lldb::SBThread { - %nothreadallow; - PyObject *lldb::SBThread::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; %pythoncode %{ def __eq__(self, rhs): if not isinstance(rhs, type(self)): @@ -456,64 +265,6 @@ return getattr(_lldb,self.__class__.__name__+"___ne__")(self, rhs) %} } -%extend lldb::SBValue { - %nothreadallow; - PyObject *lldb::SBValue::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBValueList { - %nothreadallow; - PyObject *lldb::SBValueList::__str__ (){ - lldb::SBStream description; - const size_t n = $self->GetSize(); - if (n) - { - for (size_t i=0; iGetValueAtIndex(i).GetDescription(description); - } - else - { - description.Printf(" lldb.SBValueList()"); - } - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} -%extend lldb::SBWatchpoint { - %nothreadallow; - PyObject *lldb::SBWatchpoint::__str__ (){ - lldb::SBStream description; - $self->GetDescription (description, lldb::eDescriptionLevelVerbose); - const char *desc = description.GetData(); - size_t desc_len = description.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) - --desc_len; - return PythonString(llvm::StringRef(desc, desc_len)).release(); - } - %clearnothreadallow; -} - - -// %extend lldb::SBDebugger { -// // FIXME: We can't get the callback and baton -// PyObject *lldb::SBDebugger (){ -// // Only call Py_XDECREF if we have a Python object (or NULL) -// if (LLDBSwigPythonCallPythonLogOutputCallback == $self->GetLogOutPutCallback()) -// Py_XDECREF($self->GetCallbackBaton()); -// } -// } %pythoncode %{ diff --git a/lldb/scripts/interface/SBProcess.i b/lldb/scripts/interface/SBProcess.i index c5ebc24686155..ac6a265faec9f 100644 --- a/lldb/scripts/interface/SBProcess.i +++ b/lldb/scripts/interface/SBProcess.i @@ -417,6 +417,8 @@ public: lldb::SBProcessInfo GetProcessInfo(); + STRING_EXTENSION(SBProcess) + #ifdef SWIGPYTHON %pythoncode %{ def __get_is_alive__(self): diff --git a/lldb/scripts/interface/SBSection.i b/lldb/scripts/interface/SBSection.i index c1a84acc4f144..3d1c900917fd8 100644 --- a/lldb/scripts/interface/SBSection.i +++ b/lldb/scripts/interface/SBSection.i @@ -114,6 +114,8 @@ public: bool operator != (const lldb::SBSection &rhs); + STRING_EXTENSION(SBSection) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBSymbol.i b/lldb/scripts/interface/SBSymbol.i index e5880e66d300f..4e17ab5af0fd6 100644 --- a/lldb/scripts/interface/SBSymbol.i +++ b/lldb/scripts/interface/SBSymbol.i @@ -72,6 +72,8 @@ public: bool operator != (const 
lldb::SBSymbol &rhs) const; + STRING_EXTENSION(SBSymbol) + #ifdef SWIGPYTHON %pythoncode %{ def get_instructions_from_current_target (self): diff --git a/lldb/scripts/interface/SBSymbolContext.i b/lldb/scripts/interface/SBSymbolContext.i index a6aa4d78bfe39..b6b336516c949 100644 --- a/lldb/scripts/interface/SBSymbolContext.i +++ b/lldb/scripts/interface/SBSymbolContext.i @@ -81,6 +81,7 @@ public: bool GetDescription (lldb::SBStream &description); + STRING_EXTENSION(SBSymbolContext) #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBSymbolContextList.i b/lldb/scripts/interface/SBSymbolContextList.i index 54adc659fa212..f5adcfcebfb56 100644 --- a/lldb/scripts/interface/SBSymbolContextList.i +++ b/lldb/scripts/interface/SBSymbolContextList.i @@ -60,6 +60,8 @@ public: void Clear(); + STRING_EXTENSION(SBSymbolContextList) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBThread.i b/lldb/scripts/interface/SBThread.i index c1c045487fc1d..95b15b182ec26 100644 --- a/lldb/scripts/interface/SBThread.i +++ b/lldb/scripts/interface/SBThread.i @@ -402,6 +402,8 @@ public: bool SafeToCallFunctions (); + STRING_EXTENSION(SBThread) + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBType.i b/lldb/scripts/interface/SBType.i index d9da9e39b9560..3cd82452084b4 100644 --- a/lldb/scripts/interface/SBType.i +++ b/lldb/scripts/interface/SBType.i @@ -43,6 +43,8 @@ public: uint32_t GetBitfieldSizeInBits(); + STRING_EXTENSION_LEVEL(SBTypeMember, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName, None, doc='''A read only property that returns the name for this member as a string.''') @@ -100,6 +102,7 @@ public: GetDescription (lldb::SBStream &description, lldb::DescriptionLevel description_level); + STRING_EXTENSION_LEVEL(SBTypeMemberFunction, lldb::eDescriptionLevelBrief) protected: lldb::TypeMemberFunctionImplSP m_opaque_sp; }; @@ -314,6 +317,8 @@ public: bool operator!=(lldb::SBType &rhs); + STRING_EXTENSION_LEVEL(SBType, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ def template_arg_array(self): diff --git a/lldb/scripts/interface/SBTypeCategory.i b/lldb/scripts/interface/SBTypeCategory.i index 43fe9faf70f52..b762bf8a95a36 100644 --- a/lldb/scripts/interface/SBTypeCategory.i +++ b/lldb/scripts/interface/SBTypeCategory.i @@ -124,6 +124,8 @@ namespace lldb { bool DeleteTypeSynthetic (lldb::SBTypeNameSpecifier); + STRING_EXTENSION_LEVEL(SBTypeCategory, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ diff --git a/lldb/scripts/interface/SBTypeEnumMember.i b/lldb/scripts/interface/SBTypeEnumMember.i index b2d8617117823..006bdeaa8cee1 100644 --- a/lldb/scripts/interface/SBTypeEnumMember.i +++ b/lldb/scripts/interface/SBTypeEnumMember.i @@ -43,6 +43,7 @@ public: GetDescription (lldb::SBStream &description, lldb::DescriptionLevel description_level); + STRING_EXTENSION_LEVEL(SBTypeEnumMember, lldb::eDescriptionLevelBrief) #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName, None, doc='''A read only property that returns the name for this enum member as a string.''') diff --git a/lldb/scripts/interface/SBTypeFilter.i b/lldb/scripts/interface/SBTypeFilter.i index 3759e0a23d418..c1d282c6d4fbe 100644 --- a/lldb/scripts/interface/SBTypeFilter.i +++ b/lldb/scripts/interface/SBTypeFilter.i @@ -61,6 +61,8 @@ namespace lldb { bool operator != (lldb::SBTypeFilter &rhs); + STRING_EXTENSION_LEVEL(SBTypeFilter, lldb::eDescriptionLevelBrief) + #ifdef 
SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBTypeFormat.i b/lldb/scripts/interface/SBTypeFormat.i index 5efd135b73261..765a2a7bb99dc 100644 --- a/lldb/scripts/interface/SBTypeFormat.i +++ b/lldb/scripts/interface/SBTypeFormat.i @@ -61,6 +61,8 @@ namespace lldb { bool operator != (lldb::SBTypeFormat &rhs); + STRING_EXTENSION_LEVEL(SBTypeFormat, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ format = property(GetFormat, SetFormat) diff --git a/lldb/scripts/interface/SBTypeNameSpecifier.i b/lldb/scripts/interface/SBTypeNameSpecifier.i index bb16e86b0bc50..772f7c174093f 100644 --- a/lldb/scripts/interface/SBTypeNameSpecifier.i +++ b/lldb/scripts/interface/SBTypeNameSpecifier.i @@ -53,6 +53,8 @@ namespace lldb { bool operator != (lldb::SBTypeNameSpecifier &rhs); + STRING_EXTENSION_LEVEL(SBTypeNameSpecifier, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ name = property(GetName) diff --git a/lldb/scripts/interface/SBTypeSummary.i b/lldb/scripts/interface/SBTypeSummary.i index 225a404cf73c4..adcc79b5a6ee8 100644 --- a/lldb/scripts/interface/SBTypeSummary.i +++ b/lldb/scripts/interface/SBTypeSummary.i @@ -101,6 +101,8 @@ namespace lldb { bool operator != (lldb::SBTypeSummary &rhs); + STRING_EXTENSION_LEVEL(SBTypeSummary, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBTypeSynthetic.i b/lldb/scripts/interface/SBTypeSynthetic.i index d9d75e4c9efae..f57139ebf9f17 100644 --- a/lldb/scripts/interface/SBTypeSynthetic.i +++ b/lldb/scripts/interface/SBTypeSynthetic.i @@ -63,6 +63,8 @@ namespace lldb { bool operator != (lldb::SBTypeSynthetic &rhs); + STRING_EXTENSION_LEVEL(SBTypeSynthetic, lldb::eDescriptionLevelBrief) + #ifdef SWIGPYTHON %pythoncode %{ options = property(GetOptions, SetOptions) diff --git a/lldb/scripts/interface/SBValue.i b/lldb/scripts/interface/SBValue.i index 8647854e89c15..fb899805c3951 100644 --- a/lldb/scripts/interface/SBValue.i +++ b/lldb/scripts/interface/SBValue.i @@ -440,6 +440,8 @@ public: const SBExpressionOptions &options, const char *name) const; + STRING_EXTENSION(SBValue) + #ifdef SWIGPYTHON %pythoncode %{ def __get_dynamic__ (self): diff --git a/lldb/scripts/interface/SBValueList.i b/lldb/scripts/interface/SBValueList.i index 56ef19054e4c5..17ba2056f0c23 100644 --- a/lldb/scripts/interface/SBValueList.i +++ b/lldb/scripts/interface/SBValueList.i @@ -101,6 +101,29 @@ public: lldb::SBValue GetFirstValueByName (const char* name) const; + %extend { + %nothreadallow; + std::string lldb::SBValueList::__str__ (){ + lldb::SBStream description; + const size_t n = $self->GetSize(); + if (n) + { + for (size_t i=0; iGetValueAtIndex(i).GetDescription(description); + } + else + { + description.Printf(" lldb.SBValueList()"); + } + const char *desc = description.GetData(); + size_t desc_len = description.GetSize(); + if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) + --desc_len; + return std::string(desc, desc_len); + } + %clearnothreadallow; + } + #ifdef SWIGPYTHON %pythoncode %{ def __iter__(self): diff --git a/lldb/scripts/interface/SBWatchpoint.i b/lldb/scripts/interface/SBWatchpoint.i index e11c4f213ca2e..cb0bc5f9859ac 100644 --- a/lldb/scripts/interface/SBWatchpoint.i +++ b/lldb/scripts/interface/SBWatchpoint.i @@ -90,6 +90,7 @@ public: static lldb::SBWatchpoint GetWatchpointFromEvent (const lldb::SBEvent& event); + STRING_EXTENSION_LEVEL(SBWatchpoint, 
lldb::eDescriptionLevelVerbose) }; } // namespace lldb From 4ebb589629b0d3de0827cab179338836ebb3a8b6 Mon Sep 17 00:00:00 2001 From: Zheng Chen Date: Thu, 9 Jan 2020 01:24:22 -0500 Subject: [PATCH 25/89] [SCEV] [NFC] add testcase for constant range for addrecexpr with nsw flag --- .../Analysis/ScalarEvolution/range_nw_flag.ll | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll b/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll index 65ecd312e0cbb..4a520056d7657 100644 --- a/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll +++ b/llvm/test/Analysis/ScalarEvolution/range_nw_flag.ll @@ -39,3 +39,22 @@ exit: ret void } +; CHECK-LABEL: @test-addrec-nsw +; CHECK: --> {(-1 + (-10 smin %offset)),+,-1}<%loop> U: [-2147483648,1) S: [-2147483648,1) +define void @test-addrec-nsw(float* %input, i32 %offset, i32 %numIterations) { +entry: + %cmp = icmp slt i32 %offset, -10 + %max = select i1 %cmp, i32 %offset, i32 -10 + br label %loop +loop: + %i = phi i32 [ %nexti, %loop ], [ 0, %entry ] + %nexti = add nsw i32 %i, -1 + %index32 = add nsw i32 %nexti, %max + %ptr = getelementptr inbounds float, float* %input, i32 %index32 + %f = load float, float* %ptr, align 4 + %exitcond = icmp eq i32 %nexti, %numIterations + br i1 %exitcond, label %exit, label %loop + +exit: + ret void +} From 24b326cc610dfdccdd50bc78505ec228d96c8e7a Mon Sep 17 00:00:00 2001 From: Ehud Katz Date: Thu, 9 Jan 2020 09:42:32 +0200 Subject: [PATCH 26/89] [APFloat] Fix checked error assert failures `APFLoat::convertFromString` returns `Expected` result, which must be "checked" if the LLVM_ENABLE_ABI_BREAKING_CHECKS preprocessor flag is set. To mark an `Expected` result as "checked" we must consume the `Error` within. In many cases, we are only interested in knowing if an error occured, without the need to examine the error info. This is achieved, easily, with the `errorToBool()` API. 
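As a rough illustration of the idiom this change adopts, here is a minimal sketch; the helper name and signature below are hypothetical and not part of the patch, only convertFromString() and errorToBool() come from the APIs actually touched here:

  #include "llvm/ADT/APFloat.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/Error.h"

  // Parse a floating-point literal, reporting only success or failure.
  // errorToBool() consumes the Error inside the Expected result, so the value
  // counts as "checked" even with LLVM_ENABLE_ABI_BREAKING_CHECKS enabled.
  static bool parseFloatLiteral(llvm::StringRef Str, llvm::APFloat &F) {
    auto StatusOrErr =
        F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven);
    if (llvm::errorToBool(StatusOrErr.takeError()))
      return false; // invalid floating point representation
    return *StatusOrErr == llvm::APFloat::opOK;
  }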
--- .../clang-tidy/readability/MagicNumbersCheck.cpp | 14 ++++++++------ clang/lib/Lex/LiteralSupport.cpp | 8 +++----- llvm/include/llvm/ADT/StringRef.h | 3 ++- llvm/lib/MC/MCParser/AsmParser.cpp | 4 +++- llvm/lib/Support/APFloat.cpp | 5 ++--- llvm/lib/Support/StringRef.cpp | 8 +++----- .../Target/AArch64/AsmParser/AArch64AsmParser.cpp | 4 ++-- .../Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 2 +- llvm/unittests/ADT/APFloatTest.cpp | 11 +++++++---- 9 files changed, 31 insertions(+), 28 deletions(-) diff --git a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp index 231e565f27e52..86443a155069e 100644 --- a/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp +++ b/clang-tools-extra/clang-tidy/readability/MagicNumbersCheck.cpp @@ -86,15 +86,17 @@ MagicNumbersCheck::MagicNumbersCheck(StringRef Name, ClangTidyContext *Context) IgnoredDoublePointValues.reserve(IgnoredFloatingPointValuesInput.size()); for (const auto &InputValue : IgnoredFloatingPointValuesInput) { llvm::APFloat FloatValue(llvm::APFloat::IEEEsingle()); - if (!FloatValue.convertFromString(InputValue, DefaultRoundingMode)) { - assert(false && "Invalid floating point representation"); - } + auto StatusOrErr = + FloatValue.convertFromString(InputValue, DefaultRoundingMode); + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); IgnoredFloatingPointValues.push_back(FloatValue.convertToFloat()); llvm::APFloat DoubleValue(llvm::APFloat::IEEEdouble()); - if (!DoubleValue.convertFromString(InputValue, DefaultRoundingMode)) { - assert(false && "Invalid floating point representation"); - } + StatusOrErr = + DoubleValue.convertFromString(InputValue, DefaultRoundingMode); + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); IgnoredDoublePointValues.push_back(DoubleValue.convertToDouble()); } llvm::sort(IgnoredFloatingPointValues.begin(), diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 5881852b1424a..9a852141c6eea 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1053,11 +1053,9 @@ NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) { auto StatusOrErr = Result.convertFromString(Str, APFloat::rmNearestTiesToEven); - if (!StatusOrErr) { - assert(false && "Invalid floating point representation"); - return APFloat::opInvalidOp; - } - return *StatusOrErr; + assert(StatusOrErr && "Invalid floating point representation"); + return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr + : APFloat::opInvalidOp; } static inline bool IsExponentPart(char c) { diff --git a/llvm/include/llvm/ADT/StringRef.h b/llvm/include/llvm/ADT/StringRef.h index e87a08f7effff..9bfaaccd953e2 100644 --- a/llvm/include/llvm/ADT/StringRef.h +++ b/llvm/include/llvm/ADT/StringRef.h @@ -566,7 +566,8 @@ namespace llvm { /// /// If \p AllowInexact is false, the function will fail if the string /// cannot be represented exactly. Otherwise, the function only fails - /// in case of an overflow or underflow. + /// in case of an overflow or underflow, or an invalid floating point + /// representation. 
bool getAsDouble(double &Result, bool AllowInexact = true) const; /// @} diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp index 0c4eb953aa4e6..dc8132b627a66 100644 --- a/llvm/lib/MC/MCParser/AsmParser.cpp +++ b/llvm/lib/MC/MCParser/AsmParser.cpp @@ -3130,7 +3130,9 @@ bool AsmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) { Value = APFloat::getNaN(Semantics, false, ~0); else return TokError("invalid floating point literal"); - } else if (!Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven)) + } else if (errorToBool( + Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven) + .takeError())) return TokError("invalid floating point literal"); if (IsNeg) Value.changeSign(); diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp index f8a217d3535de..050c37baefb87 100644 --- a/llvm/lib/Support/APFloat.cpp +++ b/llvm/lib/Support/APFloat.cpp @@ -4518,9 +4518,8 @@ hash_code hash_value(const APFloat &Arg) { APFloat::APFloat(const fltSemantics &Semantics, StringRef S) : APFloat(Semantics) { auto StatusOrErr = convertFromString(S, rmNearestTiesToEven); - if (!StatusOrErr) { - assert(false && "Invalid floating point representation"); - } + assert(StatusOrErr && "Invalid floating point representation"); + consumeError(StatusOrErr.takeError()); } APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics, diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp index b5db172cc1a3a..104482de4ad70 100644 --- a/llvm/lib/Support/StringRef.cpp +++ b/llvm/lib/Support/StringRef.cpp @@ -588,13 +588,11 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const { bool StringRef::getAsDouble(double &Result, bool AllowInexact) const { APFloat F(0.0); - auto ErrOrStatus = F.convertFromString(*this, APFloat::rmNearestTiesToEven); - if (!ErrOrStatus) { - assert(false && "Invalid floating point representation"); + auto StatusOrErr = F.convertFromString(*this, APFloat::rmNearestTiesToEven); + if (errorToBool(StatusOrErr.takeError())) return true; - } - APFloat::opStatus Status = *ErrOrStatus; + APFloat::opStatus Status = *StatusOrErr; if (Status != APFloat::opOK) { if (!AllowInexact || !(Status & APFloat::opInexact)) return true; diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index bd48e5d846af3..70c9db13f139d 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -1223,7 +1223,7 @@ class AArch64Operand : public MCParsedAsmOperand { APFloat RealVal(APFloat::IEEEdouble()); auto StatusOrErr = RealVal.convertFromString(Desc->Repr, APFloat::rmTowardZero); - if (!StatusOrErr || *StatusOrErr != APFloat::opOK) + if (errorToBool(StatusOrErr.takeError()) || *StatusOrErr != APFloat::opOK) llvm_unreachable("FP immediate is not exact"); if (getFPImm().bitwiseIsEqual(RealVal)) @@ -2580,7 +2580,7 @@ AArch64AsmParser::tryParseFPImm(OperandVector &Operands) { APFloat RealVal(APFloat::IEEEdouble()); auto StatusOrErr = RealVal.convertFromString(Tok.getString(), APFloat::rmTowardZero); - if (!StatusOrErr) { + if (errorToBool(StatusOrErr.takeError())) { TokError("invalid floating point representation"); return MatchOperand_ParseFail; } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 326df6bc8fb2b..d5834826fcd8b 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ 
b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -2363,7 +2363,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands, bool HasSP3AbsModifier) { APFloat RealVal(APFloat::IEEEdouble()); auto roundMode = APFloat::rmNearestTiesToEven; - if (!RealVal.convertFromString(Num, roundMode)) { + if (errorToBool(RealVal.convertFromString(Num, roundMode).takeError())) { return MatchOperand_ParseFail; } if (Negate) diff --git a/llvm/unittests/ADT/APFloatTest.cpp b/llvm/unittests/ADT/APFloatTest.cpp index adbf1b3b8c608..65b831c96e8f6 100644 --- a/llvm/unittests/ADT/APFloatTest.cpp +++ b/llvm/unittests/ADT/APFloatTest.cpp @@ -22,15 +22,18 @@ using namespace llvm; static std::string convertToErrorFromString(StringRef Str) { llvm::APFloat F(0.0); - auto ErrOrStatus = + auto StatusOrErr = F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven); - EXPECT_TRUE(!ErrOrStatus); - return toString(ErrOrStatus.takeError()); + EXPECT_TRUE(!StatusOrErr); + return toString(StatusOrErr.takeError()); } static double convertToDoubleFromString(StringRef Str) { llvm::APFloat F(0.0); - EXPECT_FALSE(!F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven)); + auto StatusOrErr = + F.convertFromString(Str, llvm::APFloat::rmNearestTiesToEven); + EXPECT_FALSE(!StatusOrErr); + consumeError(StatusOrErr.takeError()); return F.convertToDouble(); } From 65678d938431c90408afa8d255cbed3d8ed8273f Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 9 Jan 2020 02:58:21 -0500 Subject: [PATCH 27/89] [mlir][VectorOps] Implement strided_slice conversion Summary: This diff implements the progressive lowering of strided_slice to either: 1. extractelement + insertelement for the 1-D case 2. extract + optional strided_slice + insert for the n-D case. This combines properly with the other conversion patterns to lower all the way to LLVM. Appropriate tests are added. 
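Alongside the conversion pattern itself, the diff adds a small ArrayAttr::getAsRange<AttrTy>() accessor that the lowering uses to slice the offsets/sizes/strides attributes. A rough usage sketch follows; the op variable and names are hypothetical, and the fragment assumes the usual headers available inside an MLIR rewrite pattern:

  // Collect the static offsets of a vector.strided_slice op without casting
  // each ArrayAttr element by hand.
  SmallVector<int64_t, 4> offsets;
  for (IntegerAttr attr : stridedSliceOp.offsets().getAsRange<IntegerAttr>())
    offsets.push_back(attr.getInt());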
Reviewers: ftynse, rriddle, AlexEichenberger, andydavis1, tetuante Reviewed By: andydavis1 Subscribers: merge_guards_bot, mehdi_amini, jpienaar, burmako, shauheen, antiagainst, arpith-jacob, mgester, lucyrfox, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D72310 --- mlir/include/mlir/IR/Attributes.h | 19 ++++ .../VectorToLLVM/ConvertVectorToLLVM.cpp | 101 +++++++++++++++++- .../VectorToLLVM/vector-to-llvm.mlir | 61 +++++++++++ 3 files changed, 178 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/IR/Attributes.h b/mlir/include/mlir/IR/Attributes.h index b8398580f61c9..64b8063bdcb58 100644 --- a/mlir/include/mlir/IR/Attributes.h +++ b/mlir/include/mlir/IR/Attributes.h @@ -215,6 +215,25 @@ class ArrayAttr : public Attribute::AttrBase + class attr_value_iterator final + : public llvm::mapped_iterator { + public: + explicit attr_value_iterator(iterator it) + : llvm::mapped_iterator( + it, [](Attribute attr) { return attr.cast(); }) {} + AttrTy operator*() { return (*this->I).template cast(); } + }; + +public: + template + llvm::iterator_range> getAsRange() { + return llvm::make_range(attr_value_iterator(begin()), + attr_value_iterator(end())); + } }; //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index b48930c4ddab0..7035c2e55bcd7 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -6,10 +6,11 @@ // //===----------------------------------------------------------------------===// +#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVM.h" #include "mlir/Conversion/StandardToLLVM/ConvertStandardToLLVMPass.h" -#include "mlir/Conversion/VectorToLLVM/ConvertVectorToLLVM.h" #include "mlir/Dialect/LLVMIR/LLVMDialect.h" +#include "mlir/Dialect/StandardOps/Ops.h" #include "mlir/Dialect/VectorOps/VectorOps.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/Builders.h" @@ -31,6 +32,7 @@ #include "llvm/Support/ErrorHandling.h" using namespace mlir; +using namespace mlir::vector; template static LLVM::LLVMType getPtrToElementType(T containerType, @@ -723,15 +725,108 @@ class VectorPrintOpConversion : public LLVMOpLowering { } }; +// TODO(rriddle): Better support for attribute subtype forwarding + slicing. +static SmallVector getI64SubArray(ArrayAttr arrayAttr, + unsigned dropFront = 0, + unsigned dropBack = 0) { + assert(arrayAttr.size() > dropFront + dropBack && "Out of bounds"); + auto range = arrayAttr.getAsRange(); + SmallVector res; + res.reserve(arrayAttr.size() - dropFront - dropBack); + for (auto it = range.begin() + dropFront, eit = range.end() - dropBack; + it != eit; ++it) + res.push_back((*it).getValue().getSExtValue()); + return res; +} + +/// Emit the proper `ExtractOp` or `ExtractElementOp` depending on the rank +/// of `vector`. +static Value extractOne(PatternRewriter &rewriter, Location loc, Value vector, + int64_t offset) { + auto vectorType = vector.getType().cast(); + if (vectorType.getRank() > 1) + return rewriter.create(loc, vector, offset); + return rewriter.create( + loc, vectorType.getElementType(), vector, + rewriter.create(loc, offset)); +} + +/// Emit the proper `InsertOp` or `InsertElementOp` depending on the rank +/// of `vector`. 
+static Value insertOne(PatternRewriter &rewriter, Location loc, Value from, + Value into, int64_t offset) { + auto vectorType = into.getType().cast(); + if (vectorType.getRank() > 1) + return rewriter.create(loc, from, into, offset); + return rewriter.create( + loc, vectorType, from, into, + rewriter.create(loc, offset)); +} + +/// Progressive lowering of StridedSliceOp to either: +/// 1. extractelement + insertelement for the 1-D case +/// 2. extract + optional strided_slice + insert for the n-D case. +class VectorStridedSliceOpRewritePattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(StridedSliceOp op, + PatternRewriter &rewriter) const override { + auto dstType = op.getResult().getType().cast(); + + assert(!op.offsets().getValue().empty() && "Unexpected empty offsets"); + + int64_t offset = + op.offsets().getValue().front().cast().getInt(); + int64_t size = op.sizes().getValue().front().cast().getInt(); + int64_t stride = + op.strides().getValue().front().cast().getInt(); + + auto loc = op.getLoc(); + auto elemType = dstType.getElementType(); + assert(elemType.isIntOrIndexOrFloat()); + Value zero = rewriter.create(loc, elemType, + rewriter.getZeroAttr(elemType)); + Value res = rewriter.create(loc, dstType, zero); + for (int64_t off = offset, e = offset + size * stride, idx = 0; off < e; + off += stride, ++idx) { + Value extracted = extractOne(rewriter, loc, op.vector(), off); + if (op.offsets().getValue().size() > 1) { + StridedSliceOp stridedSliceOp = rewriter.create( + loc, extracted, getI64SubArray(op.offsets(), /* dropFront=*/1), + getI64SubArray(op.sizes(), /* dropFront=*/1), + getI64SubArray(op.strides(), /* dropFront=*/1)); + // Call matchAndRewrite recursively from within the pattern. This + // circumvents the current limitation that a given pattern cannot + // be called multiple times by the PatternRewrite infrastructure (to + // avoid infinite recursion, but in this case, infinite recursion + // cannot happen because the rank is strictly decreasing). + // TODO(rriddle, nicolasvasilache) Implement something like a hook for + // a potential function that must decrease and allow the same pattern + // multiple times. + auto success = matchAndRewrite(stridedSliceOp, rewriter); + (void)success; + assert(success && "Unexpected failure"); + extracted = stridedSliceOp; + } + res = insertOne(rewriter, loc, extracted, res, idx); + } + rewriter.replaceOp(op, {res}); + return matchSuccess(); + } +}; + /// Populate the given list with patterns that convert from Vector to LLVM. 
void mlir::populateVectorToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { + MLIRContext *ctx = converter.getDialect()->getContext(); + patterns.insert(ctx); patterns.insert(converter.getDialect()->getContext(), - converter); + VectorPrintOpConversion>(ctx, converter); } namespace { diff --git a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir index 1725a0b7c75c3..3a0012114302b 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-to-llvm.mlir @@ -423,3 +423,64 @@ func @vector_print_vector(%arg0: vector<2x2xf32>) { // CHECK: llvm.call @print_close() : () -> () // CHECK: llvm.call @print_close() : () -> () // CHECK: llvm.call @print_newline() : () -> () + + +func @strided_slice(%arg0: vector<4xf32>, %arg1: vector<4x8xf32>, %arg2: vector<4x8x16xf32>) { +// CHECK-LABEL: llvm.func @strided_slice( + + %0 = vector.strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement %{{.*}}, %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement %{{.*}}, %{{.*}}[%{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> + + %1 = vector.strided_slice %arg1 {offsets = [2], sizes = [2], strides = [1]} : vector<4x8xf32> to vector<2x8xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2x8xf32>) : !llvm<"[2 x <8 x float>]"> +// CHECK: llvm.extractvalue %{{.*}}[2] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[0] : !llvm<"[2 x <8 x float>]"> +// CHECK: llvm.extractvalue %{{.*}}[3] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.insertvalue %{{.*}}, %{{.*}}[1] : !llvm<"[2 x <8 x float>]"> + + %2 = vector.strided_slice %arg1 {offsets = [2, 2], sizes = [2, 2], strides = [1, 1]} : vector<4x8xf32> to vector<2x2xf32> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2x2xf32>) : !llvm<"[2 x <2 x float>]"> +// +// Subvector vector<8xf32> @2 +// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.insertvalue {{.*}}, {{.*}}[0] : !llvm<"[2 x <2 x float>]"> +// +// Subvector 
vector<8xf32> @3 +// CHECK: llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <8 x float>]"> +// CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float +// CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<8 x float>"> +// CHECK: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK: llvm.insertvalue {{.*}}, {{.*}}[1] : !llvm<"[2 x <2 x float>]"> + + return +} + + From 2d515e49d89c0738ccef8f1733d5f9afe00ee979 Mon Sep 17 00:00:00 2001 From: Nicolas Vasilache Date: Thu, 9 Jan 2020 03:12:41 -0500 Subject: [PATCH 28/89] [mlir][VectorOps] Implement insert_strided_slice conversion Summary: This diff implements the progressive lowering of insert_strided_slice. Two cases appear: 1. when the source and dest vectors have different ranks, extract the dest subvector at the proper offset and reduce to case 2. 2. when they have the same rank N: a. if the source and dest type are the same, the insertion is trivial: just forward the source b. otherwise, iterate over all N-1 D subvectors and create an extract/insert_strided_slice/insert replacement, reducing the problem to vecotrs of the same N-1 rank. This combines properly with the other conversion patterns to lower all the way to LLVM. Reviewers: ftynse, rriddle, AlexEichenberger, andydavis1, tetuante, nicolasvasilache Reviewed By: andydavis1 Subscribers: merge_guards_bot, mehdi_amini, jpienaar, burmako, shauheen, antiagainst, arpith-jacob, mgester, lucyrfox, llvm-commits Tags: #llvm Differential Revision: https://reviews.llvm.org/D72317 --- .../VectorToLLVM/ConvertVectorToLLVM.cpp | 215 ++++++++++++++---- .../VectorToLLVM/vector-to-llvm.mlir | 42 +++- 2 files changed, 215 insertions(+), 42 deletions(-) diff --git a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp index 7035c2e55bcd7..1fbee9742e0d0 100644 --- a/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp +++ b/mlir/lib/Conversion/VectorToLLVM/ConvertVectorToLLVM.cpp @@ -70,6 +70,17 @@ static Value insertOne(ConversionPatternRewriter &rewriter, rewriter.getI64ArrayAttr(pos)); } +// Helper that picks the proper sequence for inserting. +static Value insertOne(PatternRewriter &rewriter, Location loc, Value from, + Value into, int64_t offset) { + auto vectorType = into.getType().cast(); + if (vectorType.getRank() > 1) + return rewriter.create(loc, from, into, offset); + return rewriter.create( + loc, vectorType, from, into, + rewriter.create(loc, offset)); +} + // Helper that picks the proper sequence for extracting. static Value extractOne(ConversionPatternRewriter &rewriter, LLVMTypeConverter &lowering, Location loc, Value val, @@ -86,6 +97,32 @@ static Value extractOne(ConversionPatternRewriter &rewriter, rewriter.getI64ArrayAttr(pos)); } +// Helper that picks the proper sequence for extracting. 
+static Value extractOne(PatternRewriter &rewriter, Location loc, Value vector, + int64_t offset) { + auto vectorType = vector.getType().cast(); + if (vectorType.getRank() > 1) + return rewriter.create(loc, vector, offset); + return rewriter.create( + loc, vectorType.getElementType(), vector, + rewriter.create(loc, offset)); +} + +// Helper that returns a subset of `arrayAttr` as a vector of int64_t. +// TODO(rriddle): Better support for attribute subtype forwarding + slicing. +static SmallVector getI64SubArray(ArrayAttr arrayAttr, + unsigned dropFront = 0, + unsigned dropBack = 0) { + assert(arrayAttr.size() > dropFront + dropBack && "Out of bounds"); + auto range = arrayAttr.getAsRange(); + SmallVector res; + res.reserve(arrayAttr.size() - dropFront - dropBack); + for (auto it = range.begin() + dropFront, eit = range.end() - dropBack; + it != eit; ++it) + res.push_back((*it).getValue().getSExtValue()); + return res; +} + class VectorBroadcastOpConversion : public LLVMOpLowering { public: explicit VectorBroadcastOpConversion(MLIRContext *context, @@ -464,6 +501,139 @@ class VectorInsertOpConversion : public LLVMOpLowering { } }; +// When ranks are different, InsertStridedSlice needs to extract a properly +// ranked vector from the destination vector into which to insert. This pattern +// only takes care of this part and forwards the rest of the conversion to +// another pattern that converts InsertStridedSlice for operands of the same +// rank. +// +// RewritePattern for InsertStridedSliceOp where source and destination vectors +// have different ranks. In this case: +// 1. the proper subvector is extracted from the destination vector +// 2. a new InsertStridedSlice op is created to insert the source in the +// destination subvector +// 3. the destination subvector is inserted back in the proper place +// 4. the op is replaced by the result of step 3. +// The new InsertStridedSlice from step 2. will be picked up by a +// `VectorInsertStridedSliceOpSameRankRewritePattern`. +class VectorInsertStridedSliceOpDifferentRankRewritePattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(InsertStridedSliceOp op, + PatternRewriter &rewriter) const override { + auto srcType = op.getSourceVectorType(); + auto dstType = op.getDestVectorType(); + + if (op.offsets().getValue().empty()) + return matchFailure(); + + auto loc = op.getLoc(); + int64_t rankDiff = dstType.getRank() - srcType.getRank(); + assert(rankDiff >= 0); + if (rankDiff == 0) + return matchFailure(); + + int64_t rankRest = dstType.getRank() - rankDiff; + // Extract / insert the subvector of matching rank and InsertStridedSlice + // on it. + Value extracted = + rewriter.create(loc, op.dest(), + getI64SubArray(op.offsets(), /*dropFront=*/0, + /*dropFront=*/rankRest)); + // A different pattern will kick in for InsertStridedSlice with matching + // ranks. + auto stridedSliceInnerOp = rewriter.create( + loc, op.source(), extracted, + getI64SubArray(op.offsets(), /*dropFront=*/rankDiff), + getI64SubArray(op.strides(), /*dropFront=*/rankDiff)); + rewriter.replaceOpWithNewOp( + op, stridedSliceInnerOp.getResult(), op.dest(), + getI64SubArray(op.offsets(), /*dropFront=*/0, + /*dropFront=*/rankRest)); + return matchSuccess(); + } +}; + +// RewritePattern for InsertStridedSliceOp where source and destination vectors +// have the same rank. In this case, we reduce +// 1. the proper subvector is extracted from the destination vector +// 2. 
a new InsertStridedSlice op is created to insert the source in the +// destination subvector +// 3. the destination subvector is inserted back in the proper place +// 4. the op is replaced by the result of step 3. +// The new InsertStridedSlice from step 2. will be picked up by a +// `VectorInsertStridedSliceOpSameRankRewritePattern`. +class VectorInsertStridedSliceOpSameRankRewritePattern + : public OpRewritePattern { +public: + using OpRewritePattern::OpRewritePattern; + + PatternMatchResult matchAndRewrite(InsertStridedSliceOp op, + PatternRewriter &rewriter) const override { + auto srcType = op.getSourceVectorType(); + auto dstType = op.getDestVectorType(); + + if (op.offsets().getValue().empty()) + return matchFailure(); + + int64_t rankDiff = dstType.getRank() - srcType.getRank(); + assert(rankDiff >= 0); + if (rankDiff != 0) + return matchFailure(); + + if (srcType == dstType) { + rewriter.replaceOp(op, op.source()); + return matchSuccess(); + } + + int64_t offset = + op.offsets().getValue().front().cast().getInt(); + int64_t size = srcType.getShape().front(); + int64_t stride = + op.strides().getValue().front().cast().getInt(); + + auto loc = op.getLoc(); + Value res = op.dest(); + // For each slice of the source vector along the most major dimension. + for (int64_t off = offset, e = offset + size * stride, idx = 0; off < e; + off += stride, ++idx) { + // 1. extract the proper subvector (or element) from source + Value extractedSource = extractOne(rewriter, loc, op.source(), idx); + if (extractedSource.getType().isa()) { + // 2. If we have a vector, extract the proper subvector from destination + // Otherwise we are at the element level and no need to recurse. + Value extractedDest = extractOne(rewriter, loc, op.dest(), off); + // 3. Reduce the problem to lowering a new InsertStridedSlice op with + // smaller rank. + InsertStridedSliceOp insertStridedSliceOp = + rewriter.create( + loc, extractedSource, extractedDest, + getI64SubArray(op.offsets(), /* dropFront=*/1), + getI64SubArray(op.strides(), /* dropFront=*/1)); + // Call matchAndRewrite recursively from within the pattern. This + // circumvents the current limitation that a given pattern cannot + // be called multiple times by the PatternRewrite infrastructure (to + // avoid infinite recursion, but in this case, infinite recursion + // cannot happen because the rank is strictly decreasing). + // TODO(rriddle, nicolasvasilache) Implement something like a hook for + // a potential function that must decrease and allow the same pattern + // multiple times. + auto success = matchAndRewrite(insertStridedSliceOp, rewriter); + (void)success; + assert(success && "Unexpected failure"); + extractedSource = insertStridedSliceOp; + } + // 4. Insert the extractedSource into the res vector. + res = insertOne(rewriter, loc, extractedSource, res, off); + } + + rewriter.replaceOp(op, res); + return matchSuccess(); + } +}; + class VectorOuterProductOpConversion : public LLVMOpLowering { public: explicit VectorOuterProductOpConversion(MLIRContext *context, @@ -725,49 +895,10 @@ class VectorPrintOpConversion : public LLVMOpLowering { } }; -// TODO(rriddle): Better support for attribute subtype forwarding + slicing. 
-static SmallVector getI64SubArray(ArrayAttr arrayAttr, - unsigned dropFront = 0, - unsigned dropBack = 0) { - assert(arrayAttr.size() > dropFront + dropBack && "Out of bounds"); - auto range = arrayAttr.getAsRange(); - SmallVector res; - res.reserve(arrayAttr.size() - dropFront - dropBack); - for (auto it = range.begin() + dropFront, eit = range.end() - dropBack; - it != eit; ++it) - res.push_back((*it).getValue().getSExtValue()); - return res; -} - -/// Emit the proper `ExtractOp` or `ExtractElementOp` depending on the rank -/// of `vector`. -static Value extractOne(PatternRewriter &rewriter, Location loc, Value vector, - int64_t offset) { - auto vectorType = vector.getType().cast(); - if (vectorType.getRank() > 1) - return rewriter.create(loc, vector, offset); - return rewriter.create( - loc, vectorType.getElementType(), vector, - rewriter.create(loc, offset)); -} - -/// Emit the proper `InsertOp` or `InsertElementOp` depending on the rank -/// of `vector`. -static Value insertOne(PatternRewriter &rewriter, Location loc, Value from, - Value into, int64_t offset) { - auto vectorType = into.getType().cast(); - if (vectorType.getRank() > 1) - return rewriter.create(loc, from, into, offset); - return rewriter.create( - loc, vectorType, from, into, - rewriter.create(loc, offset)); -} - /// Progressive lowering of StridedSliceOp to either: /// 1. extractelement + insertelement for the 1-D case /// 2. extract + optional strided_slice + insert for the n-D case. -class VectorStridedSliceOpRewritePattern - : public OpRewritePattern { +class VectorStridedSliceOpConversion : public OpRewritePattern { public: using OpRewritePattern::OpRewritePattern; @@ -821,7 +952,9 @@ class VectorStridedSliceOpRewritePattern void mlir::populateVectorToLLVMConversionPatterns( LLVMTypeConverter &converter, OwningRewritePatternList &patterns) { MLIRContext *ctx = converter.getDialect()->getContext(); - patterns.insert(ctx); + patterns.insert(ctx); patterns.insert) { func @strided_slice(%arg0: vector<4xf32>, %arg1: vector<4x8xf32>, %arg2: vector<4x8x16xf32>) { // CHECK-LABEL: llvm.func @strided_slice( - %0 = vector.strided_slice %arg0 {offsets = [2], sizes = [2], strides = [1]} : vector<4xf32> to vector<2xf32> // CHECK: llvm.mlir.constant(0.000000e+00 : f32) : !llvm.float // CHECK: llvm.mlir.constant(dense<0.000000e+00> : vector<2xf32>) : !llvm<"<2 x float>"> @@ -483,4 +482,45 @@ func @strided_slice(%arg0: vector<4xf32>, %arg1: vector<4x8xf32>, %arg2: vector< return } +func @insert_strided_slice(%a: vector<2x2xf32>, %b: vector<4x4xf32>, %c: vector<4x4x4xf32>) { +// CHECK-LABEL: @insert_strided_slice + + %0 = vector.insert_strided_slice %b, %c {offsets = [2, 0, 0], strides = [1, 1]} : vector<4x4xf32> into vector<4x4x4xf32> +// CHECK: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x [4 x <4 x float>]]"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x [4 x <4 x float>]]"> + + %1 = vector.insert_strided_slice %a, %b {offsets = [2, 2], strides = [1, 1]} : vector<2x2xf32> into vector<4x4xf32> +// +// Subvector vector<2xf32> @0 into vector<4xf32> @2 +// CHECK: llvm.extractvalue {{.*}}[0] : !llvm<"[2 x <2 x float>]"> +// CHECK-NEXT: llvm.extractvalue {{.*}}[2] : !llvm<"[4 x <4 x float>]"> +// Element @0 -> element @2 +// CHECK-NEXT: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// 
Element @1 -> element @3 +// CHECK-NEXT: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[2] : !llvm<"[4 x <4 x float>]"> +// +// Subvector vector<2xf32> @1 into vector<4xf32> @3 +// CHECK: llvm.extractvalue {{.*}}[1] : !llvm<"[2 x <2 x float>]"> +// CHECK-NEXT: llvm.extractvalue {{.*}}[3] : !llvm<"[4 x <4 x float>]"> +// Element @0 -> element @2 +// CHECK-NEXT: llvm.mlir.constant(0 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(2 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// Element @1 -> element @3 +// CHECK-NEXT: llvm.mlir.constant(1 : index) : !llvm.i64 +// CHECK-NEXT: llvm.extractelement {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<2 x float>"> +// CHECK-NEXT: llvm.mlir.constant(3 : index) : !llvm.i64 +// CHECK-NEXT: llvm.insertelement {{.*}}, {{.*}}[{{.*}} : !llvm.i64] : !llvm<"<4 x float>"> +// CHECK-NEXT: llvm.insertvalue {{.*}}, {{.*}}[3] : !llvm<"[4 x <4 x float>]"> + + return +} From e93e0d413f3afa1df5c5f88df546bebcd1183155 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 9 Jan 2020 08:33:47 +0000 Subject: [PATCH 29/89] [ARM][LowOverheadLoops] Update liveness info After expanding the pseudo instructions, update the liveness info. We do this in a post-order traversal of the loop, including its exit blocks and preheader(s). Differential Revision: https://reviews.llvm.org/D72131 --- llvm/lib/CodeGen/LivePhysRegs.cpp | 13 +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 64 ++++++++++++ .../Thumb2/LowOverheadLoops/disjoint-vcmp.mir | 99 +++++++++---------- .../dont-remove-loop-update.mir | 2 +- .../dont-remove-loop-update2.mir | 4 +- .../dont-remove-loop-update3.mir | 2 +- .../Thumb2/LowOverheadLoops/loop-guards.ll | 22 ++--- .../LowOverheadLoops/mov-lr-terminator.mir | 6 +- .../move-def-before-start.mir | 6 +- .../LowOverheadLoops/move-start-after-def.mir | 6 +- .../LowOverheadLoops/unsafe-cpsr-loop-use.mir | 11 ++- .../Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir | 10 +- .../Thumb2/LowOverheadLoops/vctp-in-vpt.mir | 10 +- .../CodeGen/Thumb2/LowOverheadLoops/while.mir | 9 +- .../CodeGen/Thumb2/LowOverheadLoops/wlstp.mir | 24 ++--- 15 files changed, 180 insertions(+), 108 deletions(-) diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp index 7a5cffca34700..547970e7ab5d9 100644 --- a/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -276,6 +276,7 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); // We walk through the block backwards and start with the live outs. LivePhysRegs LiveRegs; @@ -294,6 +295,18 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { assert(Register::isPhysicalRegister(Reg)); bool IsNotLive = LiveRegs.available(MRI, Reg); + + // Special-case return instructions for cases when a return is not + // the last instruction in the block. 
+ if (MI.isReturn() && MFI.isCalleeSavedInfoValid()) { + for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { + if (Info.getReg() == Reg) { + IsNotLive = !Info.isRestored(); + break; + } + } + } + MO->setIsDead(IsNotLive); } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 31a98d86a54d2..d8b6a0e47d047 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -43,6 +43,61 @@ using namespace llvm; namespace { + class PostOrderLoopTraversal { + MachineLoop &ML; + MachineLoopInfo &MLI; + SmallPtrSet Visited; + SmallVector Order; + + public: + PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) + : ML(ML), MLI(MLI) { } + + const SmallVectorImpl &getOrder() const { + return Order; + } + + // Visit all the blocks within the loop, as well as exit blocks and any + // blocks properly dominating the header. + void ProcessLoop() { + std::function Search = [this, &Search] + (MachineBasicBlock *MBB) -> void { + if (Visited.count(MBB)) + return; + + Visited.insert(MBB); + for (auto *Succ : MBB->successors()) { + if (!ML.contains(Succ)) + continue; + Search(Succ); + } + Order.push_back(MBB); + }; + + // Insert exit blocks. + SmallVector ExitBlocks; + ML.getExitBlocks(ExitBlocks); + for (auto *MBB : ExitBlocks) + Order.push_back(MBB); + + // Then add the loop body. + Search(ML.getHeader()); + + // Then try the preheader and its predecessors. + std::function GetPredecessor = + [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { + Order.push_back(MBB); + if (MBB->pred_size() == 1) + GetPredecessor(*MBB->pred_begin()); + }; + + if (auto *Preheader = ML.getLoopPreheader()) + GetPredecessor(Preheader); + else if (auto *Preheader = MLI.findLoopPreheader(&ML, true)) + GetPredecessor(Preheader); + } + }; + struct PredicatedMI { MachineInstr *MI = nullptr; SetVector Predicates; @@ -976,6 +1031,15 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { ConvertVPTBlocks(LoLoop); } } + + PostOrderLoopTraversal DFS(*LoLoop.ML, *MLI); + DFS.ProcessLoop(); + const SmallVectorImpl &PostOrder = DFS.getOrder(); + for (auto *MBB : PostOrder) + recomputeLiveIns(*MBB); + + for (auto *MBB : reverse(PostOrder)) + recomputeLivenessFlags(*MBB); } bool ARMLowOverheadLoops::RevertNonLoops() { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir index cafd1317c57ca..aa8954df1c591 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +# --- | @mask = external global i16 - ; Function Attrs: nofree norecurse nounwind define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32* noalias nocapture readonly %arg3) local_unnamed_addr #0 { bb: %tmp = icmp eq i32 %arg2, 0 @@ -16,8 +16,6 @@ %mask.load = load i16, i16* %mask.gep9 %conv.mask = zext i16 %mask.load to i32 %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask) - %mask.insert = insertelement <4 x i32> undef, i32 %conv.mask, i32 0 - %invariant.limits = shufflevector <4 x i32> %mask.insert, <4 x i32> undef, <4 x i32> zeroinitializer br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb @@ 
-31,18 +29,20 @@ %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ] %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ] %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ] - %lsr.iv47 = bitcast i32* %lsr.iv4 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* + %lsr.iv47 = bitcast i32* %lsr.iv4 to <4 x i32>* %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8) %and = and <4 x i1> %vctp, %invariant.mask %tmp11 = sub i32 %tmp8, 4 %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17 - %scevgep8 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv47, i32 1 - %load.limits = load <4 x i32>, <4 x i32>* %scevgep8 - %bad.icmp = icmp ule <4 x i32> %load.limits, %invariant.limits + %scevgep2 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv47, i32 1 + %load.limits = load <4 x i32>, <4 x i32>* %scevgep2 + %0 = insertelement <4 x i32> undef, i32 %conv.mask, i32 0 + %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %bad.icmp = icmp ule <4 x i32> %load.limits, %1 call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %bad.icmp) %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1) %tmp13 = icmp ne i32 %tmp12, 0 @@ -54,13 +54,12 @@ bb27: ; preds = %bb9, %bb ret void } - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 - declare void @llvm.set.loop.iterations.i32(i32) #3 - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 - declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 - declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4 - declare void @llvm.stackprotector(i8*, i8**) #5 + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) + declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) ... 
--- @@ -85,7 +84,7 @@ frameInfo: hasStackMap: false hasPatchPoint: false stackSize: 20 - offsetAdjustment: -12 + offsetAdjustment: 0 maxAlignment: 4 adjustsStack: false hasCalls: false @@ -110,7 +109,7 @@ stack: stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - { id: 3, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, + stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - { id: 4, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, @@ -122,37 +121,36 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr - ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: liveins: $r3, $r2, $r0, $r1, $r4, $r5, $r7, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16 - ; CHECK: $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $r7, 8 ; CHECK: $sp = frame-setup tSUBspi $sp, 1, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20 ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: liveins: $r3, $r2, $r0, $r1 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg - ; CHECK: renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg - ; CHECK: renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg - ; CHECK: renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - ; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg + ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14, $noreg, $noreg + ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) + ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14, $noreg, $noreg ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg - ; CHECK: $vpr = VMSR_P0 $r12, 14, $noreg - ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0 + ; CHECK: $vpr = VMSR_P0 $r5, 14, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) + ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, 
$noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14, $noreg - ; CHECK: $lr = t2DLS renamable $lr + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r2, $r12, $r0, $q0, $r1, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 2, implicit $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr @@ -160,28 +158,26 @@ body: | ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8) - ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) + ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg - ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc + ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc bb.0.bb: successors: %bb.3(0x30000000), %bb.1(0x50000000) - liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr + liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr - frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 16 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - frame-setup CFI_INSTRUCTION offset $r6, -12 + frame-setup CFI_INSTRUCTION offset $r5, -12 frame-setup CFI_INSTRUCTION offset $r4, -16 - $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg - frame-setup CFI_INSTRUCTION def_cfa $r7, 8 $sp = frame-setup tSUBspi $sp, 1, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_offset 20 tCBZ $r2, %bb.3 bb.1.bb3: @@ -189,17 +185,17 @@ body: | liveins: $r0, $r1, $r2, $r3 $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg - renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg + renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg - renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg - renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg + renamable $r4 = t2BICri killed renamable $r4, 3, 14, $noreg, $noreg + renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) + renamable $r12 = t2SUBri killed renamable $r4, 4, 14, $noreg, $noreg renamable $r4, dead $cpsr = 
tMOVi8 1, 14, $noreg - $vpr = VMSR_P0 $r12, 14, $noreg - renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0 + $vpr = VMSR_P0 $r5, 14, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14, $noreg, $noreg renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg - renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) + renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 $r3 = tMOVr $r0, 14, $noreg t2DoLoopStart renamable $lr @@ -214,9 +210,8 @@ body: | renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8) - renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg - MVE_VPST 8, implicit $vpr + renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) + MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r3, 14, $noreg @@ -225,6 +220,6 @@ body: | bb.3.bb27: $sp = tADDspi $sp, 1, 14, $noreg - tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc + tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc ... diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir index 976c5f5d7ba36..414ccd94804bd 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -6,7 +6,7 @@ # CHECK: bb.1.vector.body: # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 +# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir index 9e429040db4fd..68f245f2ed910 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -7,8 +7,8 @@ # We shouldn't optimise away the SUB. 
# CHECK: bb.1.vector.body: -# CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 +# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir index ab7fcf843d7dc..e99cf7ad631ec 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir @@ -8,7 +8,7 @@ # CHECK: bb.1.vector.body: # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 +# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll index fd8cc9b92f2fa..f23b64013c355 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll @@ -18,9 +18,9 @@ ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS renamable $lr +; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -51,9 +51,9 @@ if.end: ; preds = %while.body, %entry ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS renamable $lr +; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -86,9 +86,9 @@ if.end: ; preds = %while.body, %while. ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS renamable $lr +; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -121,9 +121,9 @@ if.end: ; preds = %while.body, %while. ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS renamable $lr +; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 @@ -152,9 +152,9 @@ if.end: ; preds = %while.body, %while. 
; CHECK: be_ne ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: $lr = t2DLS renamable $lr +; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.1.do.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.1 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 @@ -187,7 +187,7 @@ if.end: ; preds = %do.body, %entry ; CHECK: bb.1.do.body.preheader: ; CHECK: $lr = tMOVr ; CHECK: bb.2.do.body: -; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: br label %do.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir index 57fe0492f1eb2..ef8e726eeaed8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -100,7 +100,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -114,7 +114,7 @@ body: | ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -123,7 +123,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir index ff49bb0770eec..a4fceb6f54ae0 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -107,7 +107,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -122,7 +122,7 @@ body: | ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), 
%bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -131,7 +131,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir index 3fb203ee19389..77c70ab4d22b3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -105,7 +105,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -120,7 +120,7 @@ body: | ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -129,7 +129,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir index 5f4a10249684b..f8e5351d4e6c7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir @@ -94,7 +94,7 @@ body: | ; CHECK-LABEL: name: do_copy ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: liveins: $r2, $r1, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup 
CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -102,18 +102,19 @@ body: | ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg ; CHECK: renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg - ; CHECK: $lr = tMOVr killed $r0, 14, $noreg + ; CHECK: $lr = tMOVr $r0, 14, $noreg ; CHECK: bb.1.while.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: liveins: $lr, $r0, $r2, $r1 ; CHECK: renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) ; CHECK: tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, $noreg ; CHECK: t2IT 2, 8, implicit-def $itstate - ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate + ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) + ; CHECK: renamable $lr = tMOVr killed $lr, 14, $noreg ; CHECK: t2CMPri $lr, 0, 14, $noreg, implicit-def $cpsr - ; CHECK: tBcc %bb.1, 1, $cpsr + ; CHECK: tBcc %bb.1, 1, killed $cpsr ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.2.while.end: ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir index 83dc8731f6542..a8d0b500af1e9 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -134,7 +134,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $lr + ; CHECK: liveins: $r1, $r2, $r0, $r3, $lr ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -145,7 +145,7 @@ body: | ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: liveins: $r1, $r2, $r0, $r3 ; CHECK: renamable $r12 = t2ADDri renamable $r2, 3, 14, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -154,10 +154,10 @@ body: | ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - ; CHECK: $lr = t2DLS renamable $lr + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r2, $r3, $r0, $r1 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr @@ -171,7 +171,7 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed 
renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir index fd6345693e835..00bb972580a1c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir @@ -131,7 +131,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $lr + ; CHECK: liveins: $r0, $r2, $r1, $r3, $lr ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -142,14 +142,14 @@ body: | ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3 + ; CHECK: liveins: $r0, $r2, $r1, $r3 ; CHECK: $vpr = VMSR_P0 killed $r3, 14, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14, $noreg - ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r3, $r1, $r0 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) @@ -158,7 +158,7 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir index 1ed9339a9f62b..78ec447c37c17 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir @@ -1,6 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s - # TODO: Remove the lr = tMOVr --- | @@ -91,15 +90,15 @@ body: | ; CHECK-LABEL: name: copy ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: liveins: $r0, $r2, $r1, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: $lr = t2WLS $r2, %bb.3 + ; CHECK: dead $lr = t2WLS $r2, %bb.3 ; CHECK: bb.1.while.body.preheader: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, 
$r1, $r2 + ; CHECK: liveins: $r0, $r2, $r1 ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 2, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 2, 14, $noreg ; CHECK: $lr = tMOVr killed $r2, 14, $noreg @@ -108,7 +107,7 @@ body: | ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r2, renamable $r1 = t2LDRH_PRE killed renamable $r1, 2, 14, $noreg :: (load 2 from %ir.scevgep4) ; CHECK: early-clobber renamable $r0 = t2STRH_PRE killed renamable $r2, killed renamable $r0, 2, 14, $noreg :: (store 2 into %ir.scevgep7) - ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 ; CHECK: bb.3.while.end: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 33389f4c2941c..44a072ac291b3 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -199,11 +199,11 @@ body: | ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r1, $r2, $r3, $r0 ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 + ; CHECK: liveins: $lr, $r12, $r2, $r3, $r0, $r1 ; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg @@ -213,7 +213,7 @@ body: | ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg ; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: @@ -313,7 +313,7 @@ body: | ; CHECK-LABEL: name: test_wlstp16 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: liveins: $r0, $r1, $r2, $r3, $r7, $lr + ; CHECK: liveins: $r1, $r3, $r0, $r2, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -322,7 +322,7 @@ body: | ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r3, $r0, $r2, $r1 ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) ; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 @@ -331,7 +331,7 @@ body: | ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, 
$noreg - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: @@ -422,7 +422,7 @@ body: | ; CHECK-LABEL: name: test_wlstp32 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.4(0x40000000), %bb.1(0x40000000) - ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr + ; CHECK: liveins: $r1, $r2, $r0, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -432,12 +432,12 @@ body: | ; CHECK: tB %bb.4, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2 + ; CHECK: liveins: $r2, $r0, $lr, $r1 ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 - ; CHECK: $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 + ; CHECK: liveins: $r2, $r0, $lr, $r1, $q1 + ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, killed $noreg :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14, $noreg @@ -446,10 +446,10 @@ body: | ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: liveins: $q0, $q1, $r3 + ; CHECK: liveins: $q1, $q0, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r3, 0, $noreg ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr ; CHECK: renamable $r12 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg From 08778d8c4fd8a6519c7f27bfa6b09c47262cb844 Mon Sep 17 00:00:00 2001 From: Alex Zinenko Date: Tue, 7 Jan 2020 20:00:54 +0100 Subject: [PATCH 30/89] [mlir][GPU] introduce utilities for promotion to workgroup memory Introduce a set of function that promote a memref argument of a `gpu.func` to workgroup memory using memory attribution. The promotion boils down to additional loops performing the copy from the original argument to the attributed memory in the beginning of the function, and back at the end of the function using all available threads. The loop bounds are specified so as to adapt to any size of the workgroup. These utilities are intended to compose with other existing utilities (loop coalescing and tiling) in cases where the distribution of work across threads is uneven, e.g. copying a 2D memref with only the threads along the "x" dimension. Similarly, specialization of the kernel to specific launch sizes should be implemented as a separate pass combining constant propagation and canonicalization. Introduce a simple attribute-driven pass to test the promotion transformation since we don't have a heuristic at the moment. 
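For illustration only, a minimal sketch of what such an attribute-driven test pass could look like. The pass boilerplate, the registration flag, and the "gpu.test_promote_workgroup" argument attribute name are assumptions made for this sketch; only the promoteToWorkgroupMemory(gpu::GPUFuncOp, unsigned) utility itself comes from this change:

    // Sketch: promote every gpu.func argument that carries a unit attribute.
    // Attribute name and pass scaffolding are illustrative assumptions.
    #include "mlir/Dialect/GPU/GPUDialect.h"
    #include "mlir/Dialect/GPU/MemoryPromotion.h"
    #include "mlir/Pass/Pass.h"

    using namespace mlir;

    namespace {
    struct TestGpuMemoryPromotionSketch
        : public OperationPass<TestGpuMemoryPromotionSketch, gpu::GPUFuncOp> {
      void runOnOperation() override {
        gpu::GPUFuncOp op = getOperation();
        // Function argument indices are stable: promotion adds workgroup
        // attributions after the arguments rather than removing them.
        for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i)
          if (op.getArgAttrOfType<UnitAttr>(i, "gpu.test_promote_workgroup"))
            promoteToWorkgroupMemory(op, i);
      }
    };
    } // end anonymous namespace

    static PassRegistration<TestGpuMemoryPromotionSketch>
        pass("test-gpu-memory-promotion-sketch",
             "Illustrative driver for promoteToWorkgroupMemory");

The generated copies adapt to the launch configuration at runtime because the copy loops are bounded by the workgroup sizes rather than by compile-time constants.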
Differential revision: https://reviews.llvm.org/D71904 --- mlir/include/mlir/Dialect/GPU/GPUDialect.h | 5 + mlir/include/mlir/Dialect/GPU/GPUOps.td | 4 + .../mlir/Dialect/GPU/MemoryPromotion.h | 29 +++ mlir/include/mlir/IR/Block.h | 5 + mlir/lib/Dialect/GPU/CMakeLists.txt | 20 +- mlir/lib/Dialect/GPU/IR/GPUDialect.cpp | 18 ++ .../GPU/Transforms/MemoryPromotion.cpp | 173 ++++++++++++++++++ mlir/lib/IR/Block.cpp | 14 ++ mlir/test/Dialect/GPU/promotion.mlir | 119 ++++++++++++ mlir/test/lib/Transforms/CMakeLists.txt | 3 + .../lib/Transforms/TestGpuMemoryPromotion.cpp | 40 ++++ 11 files changed, 428 insertions(+), 2 deletions(-) create mode 100644 mlir/include/mlir/Dialect/GPU/MemoryPromotion.h create mode 100644 mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp create mode 100644 mlir/test/Dialect/GPU/promotion.mlir create mode 100644 mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp diff --git a/mlir/include/mlir/Dialect/GPU/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/GPUDialect.h index 1776ff7198052..a21b51487722e 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/GPUDialect.h @@ -53,6 +53,11 @@ class GPUDialect : public Dialect { /// 'gpu.kernel' attribute. static bool isKernel(Operation *op); + /// Returns the number of workgroup (thread, block) dimensions supported in + /// the GPU dialect. + // TODO(zinenko,herhut): consider generalizing this. + static unsigned getNumWorkgroupDimensions() { return 3; } + /// Returns the numeric value used to identify the workgroup memory address /// space. static unsigned getWorkgroupAddressSpace() { return 3; } diff --git a/mlir/include/mlir/Dialect/GPU/GPUOps.td b/mlir/include/mlir/Dialect/GPU/GPUOps.td index b5b93e9b553b5..766ddbf202c25 100644 --- a/mlir/include/mlir/Dialect/GPU/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/GPUOps.td @@ -117,6 +117,10 @@ def GPU_GPUFuncOp : GPU_Op<"func", [FunctionLike, IsolatedFromAbove, Symbol]> { ]; let extraClassDeclaration = [{ + /// Adds a workgroup attribution of the MemRef type with the given shape and + /// element type. + Value addWorkgroupAttribution(ArrayRef shape, Type elementType); + /// Returns `true` if the GPU function defined by this Op is a kernel, i.e. /// it is intended to be launched from host. bool isKernel() { diff --git a/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h new file mode 100644 index 0000000000000..09c1371708f25 --- /dev/null +++ b/mlir/include/mlir/Dialect/GPU/MemoryPromotion.h @@ -0,0 +1,29 @@ +//===- MemoryPromotion.h - Utilities for moving data across GPU -*- C++ -*-===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header file declares the utility functions that generate IR copying +// the data between different levels of memory hierarchy. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_DIALECT_GPU_MEMORYPROMOTION_H +#define MLIR_DIALECT_GPU_MEMORYPROMOTION_H + +namespace mlir { + +namespace gpu { +class GPUFuncOp; +} + +/// Promotes a function argument to workgroup memory in the given function. The +/// copies will be inserted in the beginning and in the end of the function. 
+void promoteToWorkgroupMemory(gpu::GPUFuncOp op, unsigned arg); + +} // end namespace mlir + +#endif // MLIR_DIALECT_GPU_MEMORYPROMOTION_H diff --git a/mlir/include/mlir/IR/Block.h b/mlir/include/mlir/IR/Block.h index c868148f95e2e..2d3eb18d72934 100644 --- a/mlir/include/mlir/IR/Block.h +++ b/mlir/include/mlir/IR/Block.h @@ -79,6 +79,11 @@ class Block : public IRObjectWithUseList, /// Add one value to the argument list. BlockArgument addArgument(Type type); + /// Insert one value to the position in the argument list indicated by the + /// given iterator. The existing arguments are shifted. The block is expected + /// not to have predecessors. + BlockArgument insertArgument(args_iterator it, Type type); + /// Add one argument to the argument list for each type specified in the list. iterator_range addArguments(ArrayRef types); diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt index 6fe45ba49ef56..dbf05ac6ace95 100644 --- a/mlir/lib/Dialect/GPU/CMakeLists.txt +++ b/mlir/lib/Dialect/GPU/CMakeLists.txt @@ -2,9 +2,25 @@ add_llvm_library(MLIRGPU IR/GPUDialect.cpp IR/DialectRegistration.cpp Transforms/KernelOutlining.cpp + Transforms/MemoryPromotion.cpp ADDITIONAL_HEADER_DIRS ${MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/GPU ) -add_dependencies(MLIRGPU MLIRGPUOpsIncGen MLIRIR MLIRLLVMIR LLVMSupport) -target_link_libraries(MLIRGPU MLIRIR MLIRLLVMIR MLIRStandardOps LLVMSupport) +add_dependencies(MLIRGPU + MLIRGPUOpsIncGen + MLIREDSC + MLIRIR + MLIRLLVMIR + MLIRLoopOps + MLIRSupport + MLIRTransformUtils + LLVMSupport) +target_link_libraries(MLIRGPU + MLIREDSC + MLIRIR + MLIRLLVMIR + MLIRLoopOps + MLIRSupport + MLIRTransformUtils + LLVMSupport) diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp index bda8032fc21a6..32d7fae65d9ce 100644 --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -593,6 +593,24 @@ LogicalResult verify(LaunchFuncOp op) { // GPUFuncOp //===----------------------------------------------------------------------===// +/// Adds a workgroup attribution to "op" of the MemRef type with the given shape +/// and element type. +Value GPUFuncOp::addWorkgroupAttribution(ArrayRef shape, + Type elementType) { + unsigned pos = getNumFuncArguments() + getNumWorkgroupAttributions(); + Block &bodyBlock = body().front(); + Value attribution = bodyBlock.insertArgument( + std::next(bodyBlock.args_begin(), pos), + MemRefType::get(shape, elementType, /*affineMapComposition=*/{}, + GPUDialect::getWorkgroupAddressSpace())); + auto numWorkgroupBuffersAttr = + getAttrOfType(getNumWorkgroupAttributionsAttrName()); + setAttr(getNumWorkgroupAttributionsAttrName(), + IntegerAttr::get(numWorkgroupBuffersAttr.getType(), + numWorkgroupBuffersAttr.getValue() + 1)); + return attribution; +} + void GPUFuncOp::build(Builder *builder, OperationState &result, StringRef name, FunctionType type, ArrayRef workgroupAttributions, ArrayRef privateAttributions, diff --git a/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp new file mode 100644 index 0000000000000..f01a430a216da --- /dev/null +++ b/mlir/lib/Dialect/GPU/Transforms/MemoryPromotion.cpp @@ -0,0 +1,173 @@ +//===- MemoryPromotion.cpp - Utilities for moving data across GPU memories ===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements utilities that allow one to create IR moving the data +// across different levels of the GPU memory hierarchy. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/LoopOps/LoopOps.h" +#include "mlir/EDSC/Builders.h" +#include "mlir/EDSC/Helpers.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Support/Functional.h" +#include "mlir/Transforms/LoopUtils.h" + +using namespace mlir; +using namespace mlir::gpu; + +/// Returns the textual name of a GPU dimension. +static StringRef getDimName(unsigned dim) { + if (dim == 0) + return "x"; + if (dim == 1) + return "y"; + if (dim == 2) + return "z"; + + llvm_unreachable("dimension ID overflow"); +} + +/// Emits the (imperfect) loop nest performing the copy between "from" and "to" +/// values using the bounds derived from the "from" value. Emits at least +/// GPUDialect::getNumWorkgroupDimensions() loops, completing the nest with +/// single-iteration loops. Maps the innermost loops to thread dimensions, in +/// reverse order to enable access coalescing in the innermost loop. +static void insertCopyLoops(OpBuilder &builder, Location loc, + edsc::MemRefView &bounds, Value from, Value to) { + // Create EDSC handles for bounds. + unsigned rank = bounds.rank(); + SmallVector lbs, ubs, steps; + + // Make sure we have enough loops to use all thread dimensions, these trivial + // loops should be outermost and therefore inserted first. + if (rank < GPUDialect::getNumWorkgroupDimensions()) { + unsigned extraLoops = GPUDialect::getNumWorkgroupDimensions() - rank; + edsc::ValueHandle zero = edsc::intrinsics::constant_index(0); + edsc::ValueHandle one = edsc::intrinsics::constant_index(1); + lbs.resize(extraLoops, zero); + ubs.resize(extraLoops, one); + steps.resize(extraLoops, one); + } + + // Add existing bonuds. + lbs.append(bounds.getLbs().begin(), bounds.getLbs().end()); + ubs.append(bounds.getUbs().begin(), bounds.getUbs().end()); + + // Emit constant operations for steps. + steps.reserve(lbs.size()); + llvm::transform( + bounds.getSteps(), std::back_inserter(steps), + [](int64_t step) { return edsc::intrinsics::constant_index(step); }); + + // Obtain thread identifiers and block sizes, necessary to map to them. + auto indexType = builder.getIndexType(); + SmallVector threadIds, blockDims; + for (unsigned i = 0; i < 3; ++i) { + auto dimName = builder.getStringAttr(getDimName(i)); + threadIds.push_back( + builder.create(loc, indexType, dimName)); + blockDims.push_back( + builder.create(loc, indexType, dimName)); + } + + // Produce the loop nest with copies. + auto ivs = edsc::makeIndexHandles(lbs.size()); + auto ivPtrs = + edsc::makeHandlePointers(MutableArrayRef(ivs)); + edsc::LoopNestBuilder(ivPtrs, lbs, ubs, steps)([&]() { + auto activeIvs = llvm::makeArrayRef(ivs).take_back(rank); + edsc::StdIndexedValue fromHandle(from), toHandle(to); + toHandle(activeIvs) = fromHandle(activeIvs); + }); + + // Map the innermost loops to threads in reverse order. 
+ for (auto en : + llvm::enumerate(llvm::reverse(llvm::makeArrayRef(ivs).take_back( + GPUDialect::getNumWorkgroupDimensions())))) { + auto loop = cast( + en.value().getValue().getParentRegion()->getParentOp()); + mapLoopToProcessorIds(loop, {threadIds[en.index()]}, + {blockDims[en.index()]}); + } +} + +/// Emits the loop nests performing the copy to the designated location in the +/// beginning of the region, and from the designated location immediately before +/// the terminator of the first block of the region. The region is expected to +/// have one block. This boils down to the following structure +/// +/// ^bb(...): +/// +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = to ... step { +/// %0 = load %from[%arg0, ..., %argN] +/// store %0, %to[%arg0, ..., %argN] +/// } +/// ... +/// } +/// gpu.barrier +/// <... original body ...> +/// gpu.barrier +/// for %arg0 = ... to ... step ... { +/// ... +/// for %argN = to ... step { +/// %1 = load %to[%arg0, ..., %argN] +/// store %1, %from[%arg0, ..., %argN] +/// } +/// ... +/// } +/// +/// Inserts the barriers unconditionally since different threads may be copying +/// values and reading them. An analysis would be required to eliminate barriers +/// in case where value is only used by the thread that copies it. Both copies +/// are inserted unconditionally, an analysis would be required to only copy +/// live-in and live-out values when necessary. This copies the entire memref +/// pointed to by "from". In case a smaller block would be sufficient, the +/// caller can create a subview of the memref and promote it instead. +static void insertCopies(Region ®ion, Location loc, Value from, Value to) { + auto fromType = from.getType().cast(); + auto toType = to.getType().cast(); + (void)fromType; + (void)toType; + assert(fromType.getShape() == toType.getShape()); + assert(fromType.getRank() != 0); + assert(has_single_element(region) && + "unstructured control flow not supported"); + + OpBuilder builder(region.getContext()); + builder.setInsertionPointToStart(®ion.front()); + + edsc::ScopedContext edscContext(builder, loc); + edsc::MemRefView fromView(from); + insertCopyLoops(builder, loc, fromView, from, to); + builder.create(loc); + + builder.setInsertionPoint(®ion.front().back()); + builder.create(loc); + insertCopyLoops(builder, loc, fromView, to, from); +} + +/// Promotes a function argument to workgroup memory in the given function. The +/// copies will be inserted in the beginning and in the end of the function. +void mlir::promoteToWorkgroupMemory(GPUFuncOp op, unsigned arg) { + Value value = op.getArgument(arg); + auto type = value.getType().dyn_cast(); + assert(type && type.hasStaticShape() && "can only promote memrefs"); + + Value attribution = + op.addWorkgroupAttribution(type.getShape(), type.getElementType()); + + // Replace the uses first since only the original uses are currently present. + // Then insert the copies. + value.replaceAllUsesWith(attribution); + insertCopies(op.getBody(), op.getLoc(), value, attribution); +} diff --git a/mlir/lib/IR/Block.cpp b/mlir/lib/IR/Block.cpp index b0ada9981a8a7..2757c505555a1 100644 --- a/mlir/lib/IR/Block.cpp +++ b/mlir/lib/IR/Block.cpp @@ -179,6 +179,20 @@ void Block::eraseArgument(unsigned index, bool updatePredTerms) { } } +/// Insert one value to the given position of the argument list. The existing +/// arguments are shifted. The block is expected not to have predecessors. 
+BlockArgument Block::insertArgument(args_iterator it, Type type) { + assert(llvm::empty(getPredecessors()) && + "cannot insert arguments to blocks with predecessors"); + + // Use the args_iterator (on the BlockArgListType) to compute the insertion + // iterator in the underlying argument storage. + size_t distance = std::distance(args_begin(), it); + auto arg = BlockArgument::create(type, this); + arguments.insert(std::next(arguments.begin(), distance), arg); + return arg; +} + //===----------------------------------------------------------------------===// // Terminator management //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/GPU/promotion.mlir b/mlir/test/Dialect/GPU/promotion.mlir new file mode 100644 index 0000000000000..c06174e0fcded --- /dev/null +++ b/mlir/test/Dialect/GPU/promotion.mlir @@ -0,0 +1,119 @@ +// RUN: mlir-opt -test-gpu-memory-promotion -split-input-file %s | FileCheck %s + +module @foo attributes {gpu.kernel_module} { + // Verify that the attribution was indeed introduced + // CHECK-LABEL: @memref3d + // CHECK-SAME: (%[[arg:.*]]: memref<5x4xf32> + // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<5x4xf32, 3>) + gpu.func @memref3d(%arg0: memref<5x4xf32> {gpu.test_promote_workgroup}) kernel { + // Verify that loop bounds are emitted, the order does not matter. + // CHECK-DAG: %[[c1:.*]] = constant 1 + // CHECK-DAG: %[[c4:.*]] = constant 4 + // CHECK-DAG: %[[c5:.*]] = constant 5 + // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} + // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} + // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} + // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} + // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} + // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + + // Verify that loops for the copy are emitted. We only check the number of + // loops here since their bounds are produced by mapLoopToProcessorIds, + // tested separately. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + + // Verify that the copy is emitted and uses only the last two loops. + // CHECK: %[[v:.*]] = load %[[arg]][%[[i1]], %[[i2]]] + // CHECK: store %[[v]], %[[promoted]][%[[i1]], %[[i2]]] + + // Verify that the use has been rewritten. + // CHECK: "use"(%[[promoted]]) : (memref<5x4xf32, 3>) + "use"(%arg0) : (memref<5x4xf32>) -> () + + + // Verify that loops for the copy are emitted. We only check the number of + // loops here since their bounds are produced by mapLoopToProcessorIds, + // tested separately. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + + // Verify that the copy is emitted and uses only the last two loops. + // CHECK: %[[v:.*]] = load %[[promoted]][%[[i1]], %[[i2]]] + // CHECK: store %[[v]], %[[arg]][%[[i1]], %[[i2]]] + gpu.return + } +} + +// ----- + +module @foo attributes {gpu.kernel_module} { + // Verify that the attribution was indeed introduced + // CHECK-LABEL: @memref5d + // CHECK-SAME: (%[[arg:.*]]: memref<8x7x6x5x4xf32> + // CHECK-SAME: workgroup(%[[promoted:.*]] : memref<8x7x6x5x4xf32, 3>) + gpu.func @memref5d(%arg0: memref<8x7x6x5x4xf32> {gpu.test_promote_workgroup}) kernel { + // Verify that loop bounds are emitted, the order does not matter. 
+ // CHECK-DAG: %[[c0:.*]] = constant 0 + // CHECK-DAG: %[[c1:.*]] = constant 1 + // CHECK-DAG: %[[c4:.*]] = constant 4 + // CHECK-DAG: %[[c5:.*]] = constant 5 + // CHECK-DAG: %[[c6:.*]] = constant 6 + // CHECK-DAG: %[[c7:.*]] = constant 7 + // CHECK-DAG: %[[c8:.*]] = constant 8 + // CHECK-DAG: %[[tx:.*]] = "gpu.thread_id"() {dimension = "x"} + // CHECK-DAG: %[[ty:.*]] = "gpu.thread_id"() {dimension = "y"} + // CHECK-DAG: %[[tz:.*]] = "gpu.thread_id"() {dimension = "z"} + // CHECK-DAG: %[[bdx:.*]] = "gpu.block_dim"() {dimension = "x"} + // CHECK-DAG: %[[bdy:.*]] = "gpu.block_dim"() {dimension = "y"} + // CHECK-DAG: %[[bdz:.*]] = "gpu.block_dim"() {dimension = "z"} + + // Verify that loops for the copy are emitted. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + // CHECK: loop.for %[[i3:.*]] = + // CHECK: loop.for %[[i4:.*]] = + + // Verify that the copy is emitted. + // CHECK: %[[v:.*]] = load %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]] + // CHECK: store %[[v]], %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]] + + // Verify that the use has been rewritten. + // CHECK: "use"(%[[promoted]]) : (memref<8x7x6x5x4xf32, 3>) + "use"(%arg0) : (memref<8x7x6x5x4xf32>) -> () + + // Verify that loop loops for the copy are emitted. + // CHECK: loop.for %[[i0:.*]] = + // CHECK: loop.for %[[i1:.*]] = + // CHECK: loop.for %[[i2:.*]] = + // CHECK: loop.for %[[i3:.*]] = + // CHECK: loop.for %[[i4:.*]] = + + // Verify that the copy is emitted. + // CHECK: %[[v:.*]] = load %[[promoted]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]] + // CHECK: store %[[v]], %[[arg]][%[[i0]], %[[i1]], %[[i2]], %[[i3]], %[[i4]]] + gpu.return + } +} + +// ----- + +module @foo attributes {gpu.kernel_module} { + // Check that attribution insertion works fine. + // CHECK-LABEL: @insert + // CHECK-SAME: (%{{.*}}: memref<4xf32> + // CHECK-SAME: workgroup(%{{.*}}: memref<1x1xf64, 3> + // CHECK-SAME: %[[wg2:.*]] : memref<4xf32, 3>) + // CHECK-SAME: private(%{{.*}}: memref<1x1xi64, 5>) + gpu.func @insert(%arg0: memref<4xf32> {gpu.test_promote_workgroup}) + workgroup(%arg1: memref<1x1xf64, 3>) + private(%arg2: memref<1x1xi64, 5>) + kernel { + // CHECK: "use"(%[[wg2]]) + "use"(%arg0) : (memref<4xf32>) -> () + gpu.return + } +} diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt index b6338e1d167cd..ac4a4930e5a51 100644 --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -2,6 +2,7 @@ add_llvm_library(MLIRTestTransforms TestCallGraph.cpp TestConstantFold.cpp TestLoopFusion.cpp + TestGpuMemoryPromotion.cpp TestInlining.cpp TestLinalgTransforms.cpp TestLiveness.cpp @@ -26,6 +27,8 @@ add_dependencies(MLIRTestTransforms MLIRTestVectorTransformPatternsIncGen) target_link_libraries(MLIRTestTransforms MLIRAffineOps MLIRAnalysis + MLIREDSC + MLIRGPU MLIRLoopOps MLIRPass MLIRTestDialect diff --git a/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp new file mode 100644 index 0000000000000..ee0291827fa45 --- /dev/null +++ b/mlir/test/lib/Transforms/TestGpuMemoryPromotion.cpp @@ -0,0 +1,40 @@ +//===- TestGPUMemoryPromotionPass.cpp - Test pass for GPU promotion -------===// +// +// Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the pass testing the utilities for moving data across +// different levels of the GPU memory hierarchy. +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/GPU/GPUDialect.h" +#include "mlir/Dialect/GPU/MemoryPromotion.h" +#include "mlir/IR/Attributes.h" +#include "mlir/Pass/Pass.h" + +using namespace mlir; + +namespace { +/// Simple pass for testing the promotion to workgroup memory in GPU functions. +/// Promotes all arguments with "gpu.test_promote_workgroup" attribute. This +/// does not check whether the promotion is legal (e.g., amount of memory used) +/// or beneficial (e.g., makes previously uncoalesced loads coalesced). +class TestGpuMemoryPromotionPass + : public OperationPass { + void runOnOperation() override { + gpu::GPUFuncOp op = getOperation(); + for (unsigned i = 0, e = op.getNumArguments(); i < e; ++i) { + if (op.getArgAttrOfType(i, "gpu.test_promote_workgroup")) + promoteToWorkgroupMemory(op, i); + } + } +}; +} // end namespace + +static PassRegistration registration( + "test-gpu-memory-promotion", + "Promotes the annotated arguments of gpu.func to workgroup memory."); From 8f1887456ab4ba24a62ccb19d0d04b08972a0289 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Thu, 9 Jan 2020 09:14:00 +0000 Subject: [PATCH 31/89] [LV] Still vectorise when tail-folding can't find a primary inducation variable This addresses a vectorisation regression for tail-folded loops that are counting down, e.g. loops as simple as this: void foo(char *A, char *B, char *C, uint32_t N) { while (N > 0) { *C++ = *A++ + *B++; N--; } } These are loops that can be vectorised, but when tail-folding is requested, it can't find a primary induction variable which we do need for predicating the loop. As a result, the loop isn't vectorised at all, which it is able to do when tail-folding is not attempted. So, this adds a check for the primary induction variable where we decide how to lower the scalar epilogue. I.e., when there isn't a primary induction variable, a scalar epilogue loop is allowed (i.e. don't request tail-folding) so that vectorisation could still be triggered. Having this check for the primary induction variable make sense anyway, and in addition, in a follow-up of this I will look into discovering earlier the primary induction variable for counting down loops, so that this can also be tail-folded. 
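In compressed form, the lowering decision now reads roughly as below (a sketch only; the real logic is the rewritten getScalarEpilogueLowering in the diff that follows and queries the hints, TTI and the legality analysis directly, while the boolean parameters here merely flatten those queries):

  enum ScalarEpilogueLowering {
    CM_ScalarEpilogueAllowed,
    CM_ScalarEpilogueNotAllowedOptSize,
    CM_ScalarEpilogueNotNeededUsePredicate
  };

  static ScalarEpilogueLowering
  decideEpilogueLowering(bool OptForSize, bool ForceVectorize,
                         bool PredicationDisabled, bool HasPrimaryInduction,
                         bool PredicationRequestedOrProfitable) {
    // 1) Optimising for size takes precedence: no scalar epilogue at all.
    if (OptForSize && !ForceVectorize)
      return CM_ScalarEpilogueNotAllowedOptSize;
    // 2) Predication disabled on the command line, or (new in this patch) no
    //    primary induction variable to predicate on: keep a scalar epilogue
    //    so the loop can still be vectorised.
    if (PredicationDisabled || !HasPrimaryInduction)
      return CM_ScalarEpilogueAllowed;
    // 3)/4) Predication requested by flag or loop hint, or profitable per TTI:
    //    fold the tail and drop the epilogue.
    if (PredicationRequestedOrProfitable)
      return CM_ScalarEpilogueNotNeededUsePredicate;
    return CM_ScalarEpilogueAllowed;
  }
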
Differential revision: https://reviews.llvm.org/D72324 --- .../Transforms/Vectorize/LoopVectorize.cpp | 63 +++++++++++-------- .../ARM/tail-folding-counting-down.ll | 47 ++++++++++++++ .../tail-folding-counting-down.ll | 42 +++++++++++++ 3 files changed, 126 insertions(+), 26 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll create mode 100644 llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fd30d52a562a2..0400e44dd0ecf 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7502,30 +7502,43 @@ void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) { State.ILV->vectorizeMemoryInstruction(&Instr, &MaskValues); } -static ScalarEpilogueLowering -getScalarEpilogueLowering(Function *F, Loop *L, LoopVectorizeHints &Hints, - ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, - TargetTransformInfo *TTI, TargetLibraryInfo *TLI, - AssumptionCache *AC, LoopInfo *LI, - ScalarEvolution *SE, DominatorTree *DT, - const LoopAccessInfo *LAI) { - ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; +// Determine how to lower the scalar epilogue, which depends on 1) optimising +// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing +// predication, and 4) a TTI hook that analyses whether the loop is suitable +// for predication. +static ScalarEpilogueLowering getScalarEpilogueLowering( + Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI, + BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI, + AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, + LoopVectorizationLegality &LVL) { + bool OptSize = + F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, + PGSOQueryType::IRPass); + // 1) OptSize takes precedence over all other options, i.e. if this is set, + // don't look at hints or options, and don't request a scalar epilogue. + if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled) + return CM_ScalarEpilogueNotAllowedOptSize; + bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() && !PreferPredicateOverEpilog; - if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && - (F->hasOptSize() || - llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI, - PGSOQueryType::IRPass))) - SEL = CM_ScalarEpilogueNotAllowedOptSize; - else if (PreferPredicateOverEpilog || - Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || - (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, LAI) && - Hints.getPredicate() != LoopVectorizeHints::FK_Disabled && - !PredicateOptDisabled)) - SEL = CM_ScalarEpilogueNotNeededUsePredicate; + // 2) Next, if disabling predication is requested on the command line, honour + // this and request a scalar epilogue. Also do this if we don't have a + // primary induction variable, which is required for predication. + if (PredicateOptDisabled || !LVL.getPrimaryInduction()) + return CM_ScalarEpilogueAllowed; + + // 3) and 4) look if enabling predication is requested on the command line, + // with a loop hint, or if the TTI hook indicates this is profitable, request + // predication . 
+ if (PreferPredicateOverEpilog || + Hints.getPredicate() == LoopVectorizeHints::FK_Enabled || + (TTI->preferPredicateOverEpilogue(L, LI, *SE, *AC, TLI, DT, + LVL.getLAI()) && + Hints.getPredicate() != LoopVectorizeHints::FK_Disabled)) + return CM_ScalarEpilogueNotNeededUsePredicate; - return SEL; + return CM_ScalarEpilogueAllowed; } // Process the loop in the VPlan-native vectorization path. This path builds @@ -7543,9 +7556,8 @@ static bool processLoopInVPlanNativePath( Function *F = L->getHeader()->getParent(); InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI()); - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, - PSE.getSE(), DT, LVL->getLAI()); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL); LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7637,9 +7649,8 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Check the function attributes and profiles to find out if this function // should be optimized for size. - ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, - PSE.getSE(), DT, LVL.getLAI()); + ScalarEpilogueLowering SEL = getScalarEpilogueLowering( + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL); // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. They may require CFG and instruction level transformations before diff --git a/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll new file mode 100644 index 0000000000000..5a3438230a2d5 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -0,0 +1,47 @@ +; RUN: opt < %s -loop-vectorize -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s +; RUN: opt < %s -loop-vectorize -disable-mve-tail-predication=false -S | FileCheck %s + +; Check that when we can't predicate this loop that it is still vectorised (with +; an epilogue). +; TODO: the reason this can't be predicated is because a primary induction +; variable can't be found (not yet) for this counting down loop. But with that +; fixed, this should be able to be predicated. 
+ +; CHECK-LABEL: vector.body: + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv8.1m.main-arm-unknown-eabihf" + +define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) #0 { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ] + %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ] + %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1 + %0 = load i8, i8* %A.addr.07, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1 + %1 = load i8, i8* %B.addr.08, align 1 + %add = add i8 %1, %0 + %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1 + store i8 %add, i8* %C.addr.09, align 1 + %dec = add i32 %N.addr.010, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} + +attributes #0 = { nofree norecurse nounwind "target-features"="+armv8.1-m.main,+mve.fp" } diff --git a/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll new file mode 100644 index 0000000000000..2667bfe68f616 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/tail-folding-counting-down.ll @@ -0,0 +1,42 @@ +; RUN: opt < %s -loop-vectorize -prefer-predicate-over-epilog -S | FileCheck %s + +; Check that when we can't predicate this loop that it is still vectorised (with +; an epilogue). +; TODO: the reason this can't be predicated is because a primary induction +; variable can't be found (not yet) for this counting down loop. But with that +; fixed, this should be able to be predicated. 
+ +; CHECK-LABEL: vector.body: + +target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + +define dso_local void @foo(i8* noalias nocapture readonly %A, i8* noalias nocapture readonly %B, i8* noalias nocapture %C, i32 %N) { +entry: + %cmp6 = icmp eq i32 %N, 0 + br i1 %cmp6, label %while.end, label %while.body.preheader + +while.body.preheader: + br label %while.body + +while.body: + %N.addr.010 = phi i32 [ %dec, %while.body ], [ %N, %while.body.preheader ] + %C.addr.09 = phi i8* [ %incdec.ptr4, %while.body ], [ %C, %while.body.preheader ] + %B.addr.08 = phi i8* [ %incdec.ptr1, %while.body ], [ %B, %while.body.preheader ] + %A.addr.07 = phi i8* [ %incdec.ptr, %while.body ], [ %A, %while.body.preheader ] + %incdec.ptr = getelementptr inbounds i8, i8* %A.addr.07, i32 1 + %0 = load i8, i8* %A.addr.07, align 1 + %incdec.ptr1 = getelementptr inbounds i8, i8* %B.addr.08, i32 1 + %1 = load i8, i8* %B.addr.08, align 1 + %add = add i8 %1, %0 + %incdec.ptr4 = getelementptr inbounds i8, i8* %C.addr.09, i32 1 + store i8 %add, i8* %C.addr.09, align 1 + %dec = add i32 %N.addr.010, -1 + %cmp = icmp eq i32 %dec, 0 + br i1 %cmp, label %while.end.loopexit, label %while.body + +while.end.loopexit: + br label %while.end + +while.end: + ret void +} From 0541a9d4e7f85c1f08f27fe0c0cae293bbb3903a Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 9 Jan 2020 10:14:30 +0100 Subject: [PATCH 32/89] [DWARFDebugLoc] Tweak error message when resolving offset pairs with no base address The previous message mentioned DW_LLE_offset_pair, but this is incorrect/confusing because we can get this message even with DWARF4 (which does not use DW_LLE encodings). This happens because DWARF<=4 location entries are "upgraded" to DWARF v5 during parsing. The new error message refrains from referencing specific constants. Fixes pr44482. --- llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp index 8aed9ab653a16..0c5f9a9c54ec6 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp @@ -79,9 +79,9 @@ DWARFLocationInterpreter::Interpret(const DWARFLocationEntry &E) { } case dwarf::DW_LLE_offset_pair: { if (!Base) { - return createStringError( - inconvertibleErrorCode(), - "Unable to resolve DW_LLE_offset_pair: base address unknown"); + return createStringError(inconvertibleErrorCode(), + "Unable to resolve location list offset pair: " + "Base address not defined"); } DWARFAddressRange Range{Base->Address + E.Value0, Base->Address + E.Value1, Base->SectionIndex}; From 1cba2612399eaf5ad67f173ab69db0b1b0328fcb Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 9 Jan 2020 09:21:05 +0000 Subject: [PATCH 33/89] Revert "[ARM][LowOverheadLoops] Update liveness info" This reverts commit e93e0d413f3afa1df5c5f88df546bebcd1183155. There's some ordering problems on some on the buildbots which needs investigating. 
--- llvm/lib/CodeGen/LivePhysRegs.cpp | 13 --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 64 ------------ .../Thumb2/LowOverheadLoops/disjoint-vcmp.mir | 99 ++++++++++--------- .../dont-remove-loop-update.mir | 2 +- .../dont-remove-loop-update2.mir | 4 +- .../dont-remove-loop-update3.mir | 2 +- .../Thumb2/LowOverheadLoops/loop-guards.ll | 22 ++--- .../LowOverheadLoops/mov-lr-terminator.mir | 6 +- .../move-def-before-start.mir | 6 +- .../LowOverheadLoops/move-start-after-def.mir | 6 +- .../LowOverheadLoops/unsafe-cpsr-loop-use.mir | 11 +-- .../Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir | 10 +- .../Thumb2/LowOverheadLoops/vctp-in-vpt.mir | 10 +- .../CodeGen/Thumb2/LowOverheadLoops/while.mir | 9 +- .../CodeGen/Thumb2/LowOverheadLoops/wlstp.mir | 24 ++--- 15 files changed, 108 insertions(+), 180 deletions(-) diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp index 547970e7ab5d9..7a5cffca34700 100644 --- a/llvm/lib/CodeGen/LivePhysRegs.cpp +++ b/llvm/lib/CodeGen/LivePhysRegs.cpp @@ -276,7 +276,6 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { const MachineFunction &MF = *MBB.getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo(); - const MachineFrameInfo &MFI = MF.getFrameInfo(); // We walk through the block backwards and start with the live outs. LivePhysRegs LiveRegs; @@ -295,18 +294,6 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) { assert(Register::isPhysicalRegister(Reg)); bool IsNotLive = LiveRegs.available(MRI, Reg); - - // Special-case return instructions for cases when a return is not - // the last instruction in the block. - if (MI.isReturn() && MFI.isCalleeSavedInfoValid()) { - for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) { - if (Info.getReg() == Reg) { - IsNotLive = !Info.isRestored(); - break; - } - } - } - MO->setIsDead(IsNotLive); } diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index d8b6a0e47d047..31a98d86a54d2 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -43,61 +43,6 @@ using namespace llvm; namespace { - class PostOrderLoopTraversal { - MachineLoop &ML; - MachineLoopInfo &MLI; - SmallPtrSet Visited; - SmallVector Order; - - public: - PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI) - : ML(ML), MLI(MLI) { } - - const SmallVectorImpl &getOrder() const { - return Order; - } - - // Visit all the blocks within the loop, as well as exit blocks and any - // blocks properly dominating the header. - void ProcessLoop() { - std::function Search = [this, &Search] - (MachineBasicBlock *MBB) -> void { - if (Visited.count(MBB)) - return; - - Visited.insert(MBB); - for (auto *Succ : MBB->successors()) { - if (!ML.contains(Succ)) - continue; - Search(Succ); - } - Order.push_back(MBB); - }; - - // Insert exit blocks. - SmallVector ExitBlocks; - ML.getExitBlocks(ExitBlocks); - for (auto *MBB : ExitBlocks) - Order.push_back(MBB); - - // Then add the loop body. - Search(ML.getHeader()); - - // Then try the preheader and its predecessors. 
- std::function GetPredecessor = - [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void { - Order.push_back(MBB); - if (MBB->pred_size() == 1) - GetPredecessor(*MBB->pred_begin()); - }; - - if (auto *Preheader = ML.getLoopPreheader()) - GetPredecessor(Preheader); - else if (auto *Preheader = MLI.findLoopPreheader(&ML, true)) - GetPredecessor(Preheader); - } - }; - struct PredicatedMI { MachineInstr *MI = nullptr; SetVector Predicates; @@ -1031,15 +976,6 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { ConvertVPTBlocks(LoLoop); } } - - PostOrderLoopTraversal DFS(*LoLoop.ML, *MLI); - DFS.ProcessLoop(); - const SmallVectorImpl &PostOrder = DFS.getOrder(); - for (auto *MBB : PostOrder) - recomputeLiveIns(*MBB); - - for (auto *MBB : reverse(PostOrder)) - recomputeLivenessFlags(*MBB); } bool ARMLowOverheadLoops::RevertNonLoops() { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir index aa8954df1c591..cafd1317c57ca 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir @@ -1,8 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s -# --- | @mask = external global i16 + ; Function Attrs: nofree norecurse nounwind define dso_local void @test(i32* noalias nocapture %arg, i32* noalias nocapture readonly %arg1, i32 %arg2, i32* noalias nocapture readonly %arg3) local_unnamed_addr #0 { bb: %tmp = icmp eq i32 %arg2, 0 @@ -16,6 +16,8 @@ %mask.load = load i16, i16* %mask.gep9 %conv.mask = zext i16 %mask.load to i32 %invariant.mask = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %conv.mask) + %mask.insert = insertelement <4 x i32> undef, i32 %conv.mask, i32 0 + %invariant.limits = shufflevector <4 x i32> %mask.insert, <4 x i32> undef, <4 x i32> zeroinitializer br i1 %tmp, label %bb27, label %bb3 bb3: ; preds = %bb @@ -29,20 +31,18 @@ %lsr.iv = phi i32* [ %scevgep, %bb9 ], [ %arg, %bb3 ] %tmp7 = phi i32 [ %tmp6, %bb3 ], [ %tmp12, %bb9 ] %tmp8 = phi i32 [ %arg2, %bb3 ], [ %tmp11, %bb9 ] + %lsr.iv47 = bitcast i32* %lsr.iv4 to <4 x i32>* %lsr.iv1 = bitcast i32* %lsr.iv to <4 x i32>* %lsr.iv24 = bitcast i32* %lsr.iv2 to <4 x i32>* - %lsr.iv47 = bitcast i32* %lsr.iv4 to <4 x i32>* %vctp = call <4 x i1> @llvm.arm.mve.vctp32(i32 %tmp8) %and = and <4 x i1> %vctp, %invariant.mask %tmp11 = sub i32 %tmp8, 4 %tmp17 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv24, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1, i32 4, <4 x i1> %and, <4 x i32> undef) %tmp23 = mul nsw <4 x i32> %tmp22, %tmp17 - %scevgep2 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv47, i32 1 - %load.limits = load <4 x i32>, <4 x i32>* %scevgep2 - %0 = insertelement <4 x i32> undef, i32 %conv.mask, i32 0 - %1 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer - %bad.icmp = icmp ule <4 x i32> %load.limits, %1 + %scevgep8 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv47, i32 1 + %load.limits = load <4 x i32>, <4 x i32>* %scevgep8 + %bad.icmp = icmp ule <4 x i32> %load.limits, %invariant.limits call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp23, <4 x i32>* %lsr.iv1, i32 4, <4 x i1> %bad.icmp) %tmp12 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp7, i32 1) %tmp13 = icmp ne i32 %tmp12, 0 @@ -54,12 
+54,13 @@ bb27: ; preds = %bb9, %bb ret void } - declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) - declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) - declare void @llvm.set.loop.iterations.i32(i32) - declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) - declare <4 x i1> @llvm.arm.mve.vctp32(i32) - declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #1 + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #2 + declare void @llvm.set.loop.iterations.i32(i32) #3 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #3 + declare <4 x i1> @llvm.arm.mve.vctp32(i32) #4 + declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) #4 + declare void @llvm.stackprotector(i8*, i8**) #5 ... --- @@ -84,7 +85,7 @@ frameInfo: hasStackMap: false hasPatchPoint: false stackSize: 20 - offsetAdjustment: 0 + offsetAdjustment: -12 maxAlignment: 4 adjustsStack: false hasCalls: false @@ -109,7 +110,7 @@ stack: stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - { id: 3, name: '', type: spill-slot, offset: -12, size: 4, alignment: 4, - stack-id: default, callee-saved-register: '$r5', callee-saved-restored: true, + stack-id: default, callee-saved-register: '$r6', callee-saved-restored: true, debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } - { id: 4, name: '', type: spill-slot, offset: -16, size: 4, alignment: 4, stack-id: default, callee-saved-register: '$r4', callee-saved-restored: true, @@ -121,36 +122,37 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r3, $r2, $r0, $r1, $r4, $r5, $r7, $lr - ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr + ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 16 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: frame-setup CFI_INSTRUCTION offset $r5, -12 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r6, -12 ; CHECK: frame-setup CFI_INSTRUCTION offset $r4, -16 + ; CHECK: $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa $r7, 8 ; CHECK: $sp = frame-setup tSUBspi $sp, 1, 14, $noreg - ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 20 ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r3, $r2, $r0, $r1 + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg - ; CHECK: renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + ; CHECK: renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg ; CHECK: $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg - ; CHECK: renamable $r4 = t2BICri killed renamable $r4, 3, 14, $noreg, $noreg - ; CHECK: renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - ; CHECK: renamable $r12 = t2SUBri killed renamable $r4, 4, 14, $noreg, $noreg + ; CHECK: renamable $lr = t2BICri killed renamable 
$lr, 3, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) + ; CHECK: renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg ; CHECK: renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg - ; CHECK: $vpr = VMSR_P0 $r5, 14, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14, $noreg, $noreg + ; CHECK: $vpr = VMSR_P0 $r12, 14, $noreg + ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) - ; CHECK: renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 ; CHECK: $r3 = tMOVr $r0, 14, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r2, $r12, $r0, $q0, $r1, $r3 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 2, implicit $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr @@ -158,26 +160,28 @@ body: | ; CHECK: renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg ; CHECK: renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) - ; CHECK: MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr + ; CHECK: renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8) + ; CHECK: renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg - ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + ; CHECK: tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc bb.0.bb: successors: %bb.3(0x30000000), %bb.1(0x50000000) - liveins: $r0, $r1, $r2, $r3, $r4, $r5, $r7, $lr + liveins: $r0, $r1, $r2, $r3, $r4, $r6, $lr - frame-setup tPUSH 14, $noreg, killed $r4, killed $r5, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup tPUSH 14, $noreg, killed $r4, killed $r6, killed $lr, implicit-def $sp, implicit $sp frame-setup CFI_INSTRUCTION def_cfa_offset 16 frame-setup CFI_INSTRUCTION offset $lr, -4 frame-setup CFI_INSTRUCTION offset $r7, -8 - frame-setup CFI_INSTRUCTION offset $r5, -12 + frame-setup CFI_INSTRUCTION offset $r6, -12 frame-setup CFI_INSTRUCTION offset $r4, -16 + $r7 = frame-setup tADDrSPi $sp, 2, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa $r7, 8 $sp = frame-setup tSUBspi $sp, 1, 14, $noreg - 
frame-setup CFI_INSTRUCTION def_cfa_offset 20 tCBZ $r2, %bb.3 bb.1.bb3: @@ -185,17 +189,17 @@ body: | liveins: $r0, $r1, $r2, $r3 $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14, $noreg - renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14, $noreg + renamable $lr = t2ADDri renamable $r2, 3, 14, $noreg, $noreg $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14, $noreg - renamable $r4 = t2BICri killed renamable $r4, 3, 14, $noreg, $noreg - renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) - renamable $r12 = t2SUBri killed renamable $r4, 4, 14, $noreg, $noreg + renamable $lr = t2BICri killed renamable $lr, 3, 14, $noreg, $noreg + renamable $r12 = t2LDRHi12 killed renamable $r12, 0, 14, $noreg :: (dereferenceable load 2 from %ir.mask.gep9) + renamable $lr = t2SUBri killed renamable $lr, 4, 14, $noreg, $noreg renamable $r4, dead $cpsr = tMOVi8 1, 14, $noreg - $vpr = VMSR_P0 $r5, 14, $noreg - renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14, $noreg, $noreg + $vpr = VMSR_P0 $r12, 14, $noreg + renamable $q0 = MVE_VDUP32 killed renamable $r12, 0, $noreg, undef renamable $q0 renamable $r12 = t2SUBri killed renamable $r3, 16, 14, $noreg, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $lr, 19, 14, $noreg, $noreg VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) - renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0 $r3 = tMOVr $r0, 14, $noreg t2DoLoopStart renamable $lr @@ -210,8 +214,9 @@ body: | renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14, $noreg renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1 - renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8) - MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr + renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep8, align 8) + renamable $vpr = MVE_VCMPu32 renamable $q0, killed renamable $q2, 2, 0, $noreg + MVE_VPST 8, implicit $vpr MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) renamable $lr = t2LoopDec killed renamable $lr, 1 $r0 = tMOVr $r3, 14, $noreg @@ -220,6 +225,6 @@ body: | bb.3.bb27: $sp = tADDspi $sp, 1, 14, $noreg - tPOP_RET 14, $noreg, def $r4, def $r5, def $r7, def $pc + tPOP_RET 14, $noreg, def $r4, def $r6, def $r7, def $pc ... 
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir index 414ccd94804bd..976c5f5d7ba36 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -6,7 +6,7 @@ # CHECK: bb.1.vector.body: # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir index 68f245f2ed910..9e429040db4fd 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -7,8 +7,8 @@ # We shouldn't optimise away the SUB. # CHECK: bb.1.vector.body: -# CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# CHECK: renamable $r3, $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir index e99cf7ad631ec..ab7fcf843d7dc 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir @@ -8,7 +8,7 @@ # CHECK: bb.1.vector.body: # CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14, $noreg -# CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 +# CHECK: $lr = MVE_LETP renamable $lr, %bb.1 --- | target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll index f23b64013c355..fd8cc9b92f2fa 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll @@ -18,9 +18,9 @@ ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -51,9 +51,9 @@ if.end: ; preds = %while.body, %entry ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 define void @ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -86,9 +86,9 @@ if.end: ; preds = %while.body, %while. 
; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %brmerge.demorgan = and i1 %t1, %t2 @@ -121,9 +121,9 @@ if.end: ; preds = %while.body, %while. ; CHECK: t2CMPri renamable $lr, 0 ; CHECK: tBcc %bb.3 ; CHECK: bb.1.while.body.preheader: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.while.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 @@ -152,9 +152,9 @@ if.end: ; preds = %while.body, %while. ; CHECK: be_ne ; CHECK: body: ; CHECK: bb.0.entry: -; CHECK: $lr = t2DLS killed renamable $lr +; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.1.do.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.1 define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: %cmp = icmp ne i32 %N, 0 @@ -187,7 +187,7 @@ if.end: ; preds = %do.body, %entry ; CHECK: bb.1.do.body.preheader: ; CHECK: $lr = tMOVr ; CHECK: bb.2.do.body: -; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { entry: br label %do.body.preheader diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir index ef8e726eeaed8..57fe0492f1eb2 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-lr-terminator.mir @@ -100,7 +100,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -114,7 +114,7 @@ body: | ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -123,7 +123,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP renamable $lr, 
%bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir index a4fceb6f54ae0..ff49bb0770eec 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-def-before-start.mir @@ -107,7 +107,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -122,7 +122,7 @@ body: | ; CHECK: renamable $r3, dead $cpsr = tMOVi8 0, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -131,7 +131,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir index 77c70ab4d22b3..3fb203ee19389 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/move-start-after-def.mir @@ -105,7 +105,7 @@ body: | ; CHECK-LABEL: name: start_before_elems ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r1, $r0, $r2, $r3, $r4, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r4, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r4, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -120,7 +120,7 @@ body: | ; CHECK: $lr = MVE_DLSTP_32 renamable $r12 ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r12, $r2, $r3, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r1, renamable $r3, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep45, align 1) ; CHECK: renamable $r4, dead $cpsr = tADDrr renamable $r2, renamable $r3, 14, $noreg @@ -129,7 +129,7 @@ body: | ; CHECK: renamable $q1 = MVE_VLDRBU32 killed renamable $r4, 0, 0, $noreg :: (load 4 from %ir.scevgep23, align 1) ; CHECK: renamable $q0 = nuw nsw MVE_VMULi32 killed renamable $q1, killed renamable $q0, 0, 
$noreg, undef renamable $q0 ; CHECK: renamable $r0 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r0, 16, 0, killed $noreg :: (store 16 into %ir.lsr.iv1, align 4) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir index f8e5351d4e6c7..5f4a10249684b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/unsafe-cpsr-loop-use.mir @@ -94,7 +94,7 @@ body: | ; CHECK-LABEL: name: do_copy ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $r2, $r1, $r7, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -102,19 +102,18 @@ body: | ; CHECK: renamable $r0, dead $cpsr = tSUBi3 killed renamable $r1, 4, 14, $noreg ; CHECK: renamable $r1, dead $cpsr = tSUBi3 killed renamable $r2, 4, 14, $noreg ; CHECK: renamable $r2 = t2LSRri renamable $r0, 1, 14, $noreg, $noreg - ; CHECK: $lr = tMOVr $r0, 14, $noreg + ; CHECK: $lr = tMOVr killed $r0, 14, $noreg ; CHECK: bb.1.while.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $r0, $r2, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $r3, renamable $r1 = t2LDR_PRE killed renamable $r1, 4, 14, $noreg :: (load 4 from %ir.scevgep7) ; CHECK: tCMPhir renamable $lr, renamable $r2, 14, $noreg, implicit-def $cpsr ; CHECK: $lr = t2SUBri killed renamable $lr, 1, 14, $noreg, $noreg ; CHECK: t2IT 2, 8, implicit-def $itstate - ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit killed renamable $r3, implicit killed $itstate + ; CHECK: renamable $r3 = tLSRri $noreg, killed renamable $r3, 1, 2, killed $cpsr, implicit renamable $r3, implicit killed $itstate ; CHECK: early-clobber renamable $r0 = t2STR_PRE killed renamable $r3, killed renamable $r0, 4, 14, $noreg :: (store 4 into %ir.scevgep4) - ; CHECK: renamable $lr = tMOVr killed $lr, 14, $noreg ; CHECK: t2CMPri $lr, 0, 14, $noreg, implicit-def $cpsr - ; CHECK: tBcc %bb.1, 1, killed $cpsr + ; CHECK: tBcc %bb.1, 1, $cpsr ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.2.while.end: ; CHECK: $r0, dead $cpsr = tMOVi8 0, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir index a8d0b500af1e9..83dc8731f6542 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -134,7 +134,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r1, $r2, $r0, $r3, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $lr ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -145,7 +145,7 @@ body: | ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r1, $r2, $r0, $r3 + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2ADDri 
renamable $r2, 3, 14, $noreg, $noreg ; CHECK: renamable $lr = t2MOVi 1, 14, $noreg, $noreg ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14, $noreg, $noreg @@ -154,10 +154,10 @@ body: | ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = t2DLS renamable $lr ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r2, $r3, $r0, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr @@ -171,7 +171,7 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir index 00bb972580a1c..fd6345693e835 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt.mir @@ -131,7 +131,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0.bb: ; CHECK: successors: %bb.3(0x30000000), %bb.1(0x50000000) - ; CHECK: liveins: $r0, $r2, $r1, $r3, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $lr ; CHECK: frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -142,14 +142,14 @@ body: | ; CHECK: tCBZ $r2, %bb.3 ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r2, $r1, $r3 + ; CHECK: liveins: $r0, $r1, $r2, $r3 ; CHECK: $vpr = VMSR_P0 killed $r3, 14, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 + ; CHECK: $lr = MVE_DLSTP_32 renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r3, $r1, $r0 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14, $noreg :: (load 4 from %stack.0) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4, !tbaa !3) @@ -158,7 +158,7 @@ body: | ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4, !tbaa !3) ; CHECK: $r0 = tMOVr $r3, 14, $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14, $noreg ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir index 78ec447c37c17..1ed9339a9f62b 100644 
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/while.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s + # TODO: Remove the lr = tMOVr --- | @@ -90,15 +91,15 @@ body: | ; CHECK-LABEL: name: copy ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x40000000), %bb.3(0x40000000) - ; CHECK: liveins: $r0, $r2, $r1, $r7, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: dead $lr = t2WLS $r2, %bb.3 + ; CHECK: $lr = t2WLS $r2, %bb.3 ; CHECK: bb.1.while.body.preheader: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r0, $r2, $r1 + ; CHECK: liveins: $r0, $r1, $r2 ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 2, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tSUBi8 killed renamable $r0, 2, 14, $noreg ; CHECK: $lr = tMOVr killed $r2, 14, $noreg @@ -107,7 +108,7 @@ body: | ; CHECK: liveins: $lr, $r0, $r1 ; CHECK: renamable $r2, renamable $r1 = t2LDRH_PRE killed renamable $r1, 2, 14, $noreg :: (load 2 from %ir.scevgep4) ; CHECK: early-clobber renamable $r0 = t2STRH_PRE killed renamable $r2, killed renamable $r0, 2, 14, $noreg :: (store 2 into %ir.scevgep7) - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 ; CHECK: bb.3.while.end: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir index 44a072ac291b3..33389f4c2941c 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/wlstp.mir @@ -199,11 +199,11 @@ body: | ; CHECK: tB %bb.3, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $lr, $r1, $r2, $r3, $r0 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $r12 = t2MOVi 0, 14, $noreg, $noreg ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $lr, $r12, $r2, $r3, $r0, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3, $r12 ; CHECK: renamable $r4 = t2ADDrr renamable $r1, renamable $r12, 14, $noreg, $noreg ; CHECK: renamable $q0 = MVE_VLDRBU8 killed renamable $r4, 0, 0, $noreg :: (load 16 from %ir.scevgep45, align 1) ; CHECK: renamable $r4 = t2ADDrr renamable $r2, renamable $r12, 14, $noreg, $noreg @@ -213,7 +213,7 @@ body: | ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 16, 14, $noreg ; CHECK: renamable $q0 = MVE_VMULi8 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 ; CHECK: MVE_VSTRBU8 killed renamable $q0, killed renamable $r4, 0, 0, killed $noreg :: (store 16 into %ir.scevgep1, align 1) - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r4, def $pc bb.0.entry: @@ -313,7 +313,7 @@ body: | ; CHECK-LABEL: name: test_wlstp16 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; CHECK: liveins: $r1, $r3, $r0, $r2, $r7, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r3, $r7, 
$lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -322,7 +322,7 @@ body: | ; CHECK: tB %bb.2, 14, $noreg ; CHECK: bb.1.vector.body: ; CHECK: successors: %bb.2(0x04000000), %bb.1(0x7c000000) - ; CHECK: liveins: $lr, $r3, $r0, $r2, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 ; CHECK: renamable $q0 = MVE_VLDRHU16 renamable $r1, 0, 0, $noreg :: (load 16 from %ir.lsr.iv57, align 2) ; CHECK: renamable $q1 = MVE_VLDRHU16 renamable $r2, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 2) ; CHECK: renamable $q0 = MVE_VMULi16 killed renamable $q1, killed renamable $q0, 0, $noreg, undef renamable $q0 @@ -331,7 +331,7 @@ body: | ; CHECK: renamable $r2, dead $cpsr = tADDi8 killed renamable $r2, 16, 14, $noreg ; CHECK: renamable $r0, dead $cpsr = tADDi8 killed renamable $r0, 16, 14, $noreg ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 8, 14, $noreg - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.1 ; CHECK: bb.2.for.cond.cleanup: ; CHECK: tPOP_RET 14, $noreg, def $r7, def $pc bb.0.entry: @@ -422,7 +422,7 @@ body: | ; CHECK-LABEL: name: test_wlstp32 ; CHECK: bb.0.entry: ; CHECK: successors: %bb.4(0x40000000), %bb.1(0x40000000) - ; CHECK: liveins: $r1, $r2, $r0, $r7, $lr + ; CHECK: liveins: $r0, $r1, $r2, $r7, $lr ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 @@ -432,12 +432,12 @@ body: | ; CHECK: tB %bb.4, 14, $noreg ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) - ; CHECK: liveins: $r2, $r0, $lr, $r1 + ; CHECK: liveins: $lr, $r0, $r1, $r2 ; CHECK: renamable $q1 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.3(0x04000000), %bb.2(0x7c000000) - ; CHECK: liveins: $r2, $r0, $lr, $r1, $q1 - ; CHECK: $q0 = MVE_VORR killed $q1, killed $q1, 0, $noreg, undef $q0 + ; CHECK: liveins: $lr, $q1, $r0, $r1, $r2 + ; CHECK: $q0 = MVE_VORR killed $q1, $q1, 0, $noreg, undef $q0 ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, $noreg :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: renamable $q2 = MVE_VLDRWU32 renamable $r1, 0, 0, killed $noreg :: (load 16 from %ir.lsr.iv1, align 4) ; CHECK: $r3 = tMOVr $r2, 14, $noreg @@ -446,10 +446,10 @@ body: | ; CHECK: renamable $r1, dead $cpsr = tADDi8 killed renamable $r1, 16, 14, $noreg ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed $r2, 4, 14, $noreg ; CHECK: renamable $q1 = nsw MVE_VADDi32 killed renamable $q1, renamable $q0, 0, $noreg, undef renamable $q1 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP renamable $lr, %bb.2 ; CHECK: bb.3.middle.block: ; CHECK: successors: %bb.4(0x80000000) - ; CHECK: liveins: $q1, $q0, $r3 + ; CHECK: liveins: $q0, $q1, $r3 ; CHECK: renamable $vpr = MVE_VCTP32 killed renamable $r3, 0, $noreg ; CHECK: renamable $q0 = MVE_VPSEL killed renamable $q1, killed renamable $q0, 0, killed renamable $vpr ; CHECK: renamable $r12 = MVE_VADDVu32no_acc killed renamable $q0, 0, $noreg From 459ad8e97e07c823181cd457f2c2ac08f3896e37 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 9 Jan 2020 09:47:26 +0000 Subject: [PATCH 34/89] [Matrix] Implement back-propagation of shape information. 
This patch extends the shape propagation for matrix operations to also propagate the shape of instructions to their operands. Reviewers: anemet, Gerolf, reames, hfinkel, andrew.w.kaylor Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D70899 --- .../Scalar/LowerMatrixIntrinsics.cpp | 64 ++++++++- .../propagate-backward.ll | 87 +++++++++++ .../propagate-backwards-unsupported.ll | 135 ++++++++++++++++++ 3 files changed, 285 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index a9566422a8324..c39fdac93b987 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -402,9 +402,71 @@ class LowerMatrixIntrinsics { } } + /// Propagate the shape to operands of instructions with shape information. + void propagateShapeBackward() { + SmallVector WorkList; + // Worklist contains instruction for which we already know the shape. + for (auto &V : ShapeMap) + WorkList.push_back(V.first); + + // Pop an element with known shape. Traverse the operands, if their shape + // derives from the result shape and is unknown, add it and add them to the + // worklist. + LLVM_DEBUG(dbgs() << "Backward-propagate shapes:\n"); + while (!WorkList.empty()) { + Value *V = WorkList.back(); + WorkList.pop_back(); + + if (!isa(V)) + continue; + + Value *MatrixA; + Value *MatrixB; + Value *M; + Value *N; + Value *K; + if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(MatrixB), m_Value(M), + m_Value(N), m_Value(K)))) { + if (setShapeInfo(MatrixA, {M, N})) + WorkList.push_back(MatrixA); + + if (setShapeInfo(MatrixB, {N, K})) + WorkList.push_back(MatrixB); + + } else if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(M), m_Value(N)))) { + // Flip dimensions. + if (setShapeInfo(MatrixA, {M, N})) + WorkList.push_back(MatrixA); + } else if (match(V, m_Intrinsic( + m_Value(MatrixA), m_Value(), m_Value(), + m_Value(M), m_Value(N)))) { + if (setShapeInfo(MatrixA, {M, N})) { + WorkList.push_back(MatrixA); + } + } else if (isa(V) || + match(V, m_Intrinsic())) { + // Nothing to do, no matrix input. + } else if (isa(V)) { + // Nothing to do. We forward-propagated to this so we would just + // backward propagate to an instruction with an already known shape. + } else if (isUniformShape(V)) { + // Propagate to all operands. 
+ ShapeInfo Shape = ShapeMap[V]; + for (Use &U : cast(V)->operands()) { + if (setShapeInfo(U.get(), Shape)) + WorkList.push_back(U.get()); + } + } + } + } + bool Visit() { - if (EnableShapePropagation) + if (EnableShapePropagation) { propagateShapeForward(); + propagateShapeBackward(); + } ReversePostOrderTraversal RPOT(&Func); bool Changed = false; diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll new file mode 100644 index 0000000000000..f4d7c0e5120e7 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +define <8 x double> @fadd_transpose(<8 x double> %a, <8 x double> %b) { +; CHECK-LABEL: @fadd_transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A:%.*]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SPLIT]], [[SPLIT4]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SPLIT1]], [[SPLIT5]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SPLIT2]], [[SPLIT6]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SPLIT3]], [[SPLIT7]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; +entry: + %add = fadd <8 x double> 
%a, %b + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %add, i32 2, i32 4) + ret <8 x double> %c +} + +define <8 x double> @load_fadd_transpose(<8 x double>* %A.Ptr, <8 x double> %b) { +; CHECK-LABEL: @load_fadd_transpose( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[A:%.*]] = load <8 x double>, <8 x double>* [[A_PTR:%.*]] +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SPLIT]], [[SPLIT4]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SPLIT1]], [[SPLIT5]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SPLIT2]], [[SPLIT6]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SPLIT3]], [[SPLIT7]] +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP5]], double [[TMP6]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x double> [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x double> [[TMP7]], double [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <2 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x double> [[TMP9]], double [[TMP10]], i64 3 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x double> [[TMP0]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x double> [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <2 x double> [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <2 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 3 +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> [[TMP19]], <8 x i32> +; CHECK-NEXT: ret <8 x double> [[TMP20]] +; +entry: + %a = load <8 x double>, <8 x double>* %A.Ptr + %add = fadd <8 x double> %a, %b + %c = call <8 x double> @llvm.matrix.transpose(<8 x double> %add, i32 2, i32 4) + ret <8 x double> %c +} + +declare <8 x double> @llvm.matrix.transpose(<8 x double>, i32, i32) diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll new file mode 100644 index 0000000000000..591cddd261cf6 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backwards-unsupported.ll @@ -0,0 +1,135 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt 
-lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + +; Check that we we use flattened vectors for PHI operands and extract the columns afterwards. +define <9 x double> @unsupported_phi(i1 %cond, <9 x double> %A, <9 x double> %B, <9 x double> %C) { +; CHECK-LABEL: @unsupported_phi( +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] + +; CHECK-LABEL: if.then: +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> [[TMP11]], <6 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x double> [[TMP18]], <6 x double> [[TMP19]], <9 x i32> +; CHECK-NEXT: br label [[IF_END:%.*]] + +; CHECK-LABEL: if.else: +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <3 x double> undef, double [[TMP21]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <3 x double> [[TMP22]], double [[TMP23]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <3 x double> [[TMP24]], double [[TMP25]], i64 2 +; CHECK-NEXT: [[TMP27:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP28:%.*]] = 
insertelement <3 x double> undef, double [[TMP27]], i64 0 +; CHECK-NEXT: [[TMP29:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <3 x double> [[TMP28]], double [[TMP29]], i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <3 x double> [[TMP30]], double [[TMP31]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = insertelement <3 x double> undef, double [[TMP33]], i64 0 +; CHECK-NEXT: [[TMP35:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <3 x double> [[TMP34]], double [[TMP35]], i64 1 +; CHECK-NEXT: [[TMP37:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <3 x double> [[TMP36]], double [[TMP37]], i64 2 +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <3 x double> [[TMP26]], <3 x double> [[TMP32]], <6 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <3 x double> [[TMP38]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP41:%.*]] = shufflevector <6 x double> [[TMP39]], <6 x double> [[TMP40]], <9 x i32> +; CHECK-NEXT: br label [[IF_END]] + +; CHECK-LABEL: if.end: +; CHECK-NEXT: [[MERGE:%.*]] = phi <9 x double> [ [[TMP20]], [[IF_THEN]] ], [ [[TMP41]], [[IF_ELSE]] ] +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[C:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT9:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT10:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT11:%.*]] = shufflevector <9 x double> [[MERGE]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[SPLIT6]], <3 x double> undef, <1 x i32> zeroinitializer +; +entry: + br i1 %cond, label %if.then, label %if.else + +if.then: ; preds = %entry + %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3) + br label %if.end + +if.else: ; preds = %entry + %B.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %B, i32 3, i32 3) + br label %if.end + +if.end: ; preds = %if.then, %if.else + %merge = phi <9 x double> [ %A.trans, %if.then], [ %B.trans, %if.else ] + %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %C, <9 x double> %merge, i32 3, i32 3, i32 3) + ret <9 x double> %res +} + +; Make sure we use a flattened vector when calling @foo and the use its flat vector result properly. 
+define <9 x double> @unsupported_call(i1 %cond, <9 x double> %A, <9 x double> %B) { +; CHECK-LABEL: @unsupported_call( +; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x double> undef, double [[TMP1]], i64 0 +; CHECK-NEXT: [[TMP3:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x double> [[TMP2]], double [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP5:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <3 x double> [[TMP4]], double [[TMP5]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <3 x double> undef, double [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP9:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <3 x double> [[TMP8]], double [[TMP9]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <3 x double> [[TMP10]], double [[TMP11]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <3 x double> undef, double [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <3 x double> [[TMP14]], double [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <3 x double> [[TMP16]], double [[TMP17]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP6]], <3 x double> [[TMP12]], <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <3 x double> [[TMP18]], <3 x double> undef, <6 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <6 x double> [[TMP19]], <6 x double> [[TMP20]], <9 x i32> +; CHECK-NEXT: [[A_FOO:%.*]] = call <9 x double> @foo(<9 x double> [[TMP21]]) +; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B:%.*]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[SPLIT8:%.*]] = shufflevector <9 x double> [[A_FOO]], <9 x double> undef, <3 x i32> +; + %A.trans = tail call <9 x double> @llvm.matrix.transpose.v9f64(<9 x double> %A, i32 3, i32 3) + %A.foo = call <9 x double> @foo(<9 x double> %A.trans) + %res = tail call <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double> %B, <9 x double> %A.foo, i32 3, i32 3, i32 3) + ret <9 x double> %res +} + +declare <9 x double> @llvm.matrix.multiply.v9f64.v9f64.v9f64(<9 x double>, <9 x double>, i32 immarg, i32 immarg, i32 immarg) +declare <9 x double> @llvm.matrix.transpose.v9f64(<9 x double>, i32 immarg, i32 immarg) +declare <9 x double> @foo(<9 x double>) From f0abe820eebf47a3e9b9d0daf9f995d65db186bc Mon Sep 17 00:00:00 2001 From: Evgeniy Brevnov Date: 
Tue, 31 Dec 2019 13:33:44 +0700 Subject: [PATCH 35/89] [LoopUtils][NFC] Minor refactoring in getLoopEstimatedTripCount. --- llvm/include/llvm/Support/MathExtras.h | 5 +++++ llvm/lib/Transforms/Utils/LoopUtils.cpp | 14 +++++++------- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/Support/MathExtras.h b/llvm/include/llvm/Support/MathExtras.h index 004a6f5f6eb80..37b9669cbeed9 100644 --- a/llvm/include/llvm/Support/MathExtras.h +++ b/llvm/include/llvm/Support/MathExtras.h @@ -732,6 +732,11 @@ inline uint64_t divideCeil(uint64_t Numerator, uint64_t Denominator) { return alignTo(Numerator, Denominator) / Denominator; } +/// Returns the integer nearest(Numerator / Denominator). +inline uint64_t divideNearest(uint64_t Numerator, uint64_t Denominator) { + return (Numerator + (Denominator / 2)) / Denominator; +} + /// Returns the largest uint64_t less than or equal to \p Value and is /// \p Skew mod \p Align. \p Align must be non-zero inline uint64_t alignDown(uint64_t Value, uint64_t Align, uint64_t Skew = 0) { diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index b7dd3d75e4580..c4c40189fda46 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -714,19 +714,19 @@ Optional llvm::getLoopEstimatedTripCount(Loop *L) { // To estimate the number of times the loop body was executed, we want to // know the number of times the backedge was taken, vs. the number of times // we exited the loop. - uint64_t TrueVal, FalseVal; - if (!LatchBR->extractProfMetadata(TrueVal, FalseVal)) + uint64_t BackedgeTakenWeight, LatchExitWeight; + if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight)) return None; - if (!TrueVal || !FalseVal) + if (LatchBR->getSuccessor(0) != L->getHeader()) + std::swap(BackedgeTakenWeight, LatchExitWeight); + + if (!BackedgeTakenWeight || !LatchExitWeight) return 0; // Divide the count of the backedge by the count of the edge exiting the loop, // rounding to nearest. - if (LatchBR->getSuccessor(0) == L->getHeader()) - return (TrueVal + (FalseVal / 2)) / FalseVal; - else - return (FalseVal + (TrueVal / 2)) / TrueVal; + return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight); } bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop, From 00c6e98409f98c3093aab4b1bfbc25c5b54731d0 Mon Sep 17 00:00:00 2001 From: "Kazushi (Jam) Marukawa" Date: Thu, 9 Jan 2020 09:59:37 +0100 Subject: [PATCH 36/89] [VE] Target stub for NEC SX-Aurora Summary: This patch registers the 've' target: the NEC SX-Aurora TSUBASA Vector Engine. 
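As a quick illustration (a sketch only, not part of this patch), the new architecture enum and the isVE() helper added below can be exercised through the existing llvm::Triple API roughly as follows; the function name and assertions here are hypothetical test code, not taken from the change:

    // Sketch: exercising the 've' triple support this patch introduces.
    // Assumes the llvm::Triple API from llvm/ADT/Triple.h; not part of the patch.
    #include "llvm/ADT/Triple.h"
    #include <cassert>

    static void checkVETriple() {
      llvm::Triple T("ve-unknown-linux");
      assert(T.getArch() == llvm::Triple::ve); // parsed via parseArch("ve")
      assert(T.isVE());                        // helper added to Triple.h below
      assert(T.isArch64Bit());                 // getArchPointerBitWidth returns 64 for ve
      assert(T.isLittleEndian());              // ve is added to the little-endian list
    }

The TripleTest.cpp hunk at the end of this patch covers the same parsing case and the "ve-linux" to "ve-unknown-linux" normalization.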
Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D69103 --- llvm/CODE_OWNERS.TXT | 4 ++ llvm/include/llvm/ADT/Triple.h | 8 ++- llvm/lib/Support/Triple.cpp | 11 ++++ llvm/lib/Target/LLVMBuild.txt | 1 + llvm/lib/Target/VE/CMakeLists.txt | 8 +++ llvm/lib/Target/VE/LLVMBuild.txt | 33 ++++++++++ .../lib/Target/VE/MCTargetDesc/CMakeLists.txt | 3 + llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt | 22 +++++++ .../Target/VE/MCTargetDesc/VEMCTargetDesc.cpp | 19 ++++++ .../Target/VE/MCTargetDesc/VEMCTargetDesc.h | 27 ++++++++ llvm/lib/Target/VE/TargetInfo/CMakeLists.txt | 3 + llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt | 22 +++++++ .../lib/Target/VE/TargetInfo/VETargetInfo.cpp | 23 +++++++ llvm/lib/Target/VE/VE.h | 19 ++++++ llvm/lib/Target/VE/VETargetMachine.cpp | 62 +++++++++++++++++++ llvm/lib/Target/VE/VETargetMachine.h | 31 ++++++++++ llvm/test/CodeGen/VE/lit.local.cfg | 2 + llvm/test/CodeGen/VE/target_support.ll | 2 + llvm/unittests/ADT/TripleTest.cpp | 8 +++ 19 files changed, 307 insertions(+), 1 deletion(-) create mode 100644 llvm/lib/Target/VE/CMakeLists.txt create mode 100644 llvm/lib/Target/VE/LLVMBuild.txt create mode 100644 llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt create mode 100644 llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt create mode 100644 llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp create mode 100644 llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h create mode 100644 llvm/lib/Target/VE/TargetInfo/CMakeLists.txt create mode 100644 llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt create mode 100644 llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp create mode 100644 llvm/lib/Target/VE/VE.h create mode 100644 llvm/lib/Target/VE/VETargetMachine.cpp create mode 100644 llvm/lib/Target/VE/VETargetMachine.h create mode 100644 llvm/test/CodeGen/VE/lit.local.cfg create mode 100644 llvm/test/CodeGen/VE/target_support.ll diff --git a/llvm/CODE_OWNERS.TXT b/llvm/CODE_OWNERS.TXT index df8aa0b4ef9d2..457dabe39f90c 100644 --- a/llvm/CODE_OWNERS.TXT +++ b/llvm/CODE_OWNERS.TXT @@ -150,6 +150,10 @@ N: Dylan McKay E: me@dylanmckay.io D: AVR Backend +N: Simon Moll +E: simon.moll@emea.nec.com +D: VE Backend + N: Tim Northover E: t.p.northover@gmail.com D: AArch64 backend, misc ARM backend diff --git a/llvm/include/llvm/ADT/Triple.h b/llvm/include/llvm/ADT/Triple.h index 88a86bfa5ff3a..76a754d671fb6 100644 --- a/llvm/include/llvm/ADT/Triple.h +++ b/llvm/include/llvm/ADT/Triple.h @@ -95,7 +95,8 @@ class Triple { wasm64, // WebAssembly with 64-bit pointers renderscript32, // 32-bit RenderScript renderscript64, // 64-bit RenderScript - LastArchType = renderscript64 + ve, // NEC SX-Aurora Vector Engine + LastArchType = ve }; enum SubArchType { NoSubArch, @@ -735,6 +736,11 @@ class Triple { return getArch() == Triple::x86 || getArch() == Triple::x86_64; } + /// Tests whether the target is VE + bool isVE() const { + return getArch() == Triple::ve; + } + /// Tests whether the target supports comdat bool supportsCOMDAT() const { return !isOSBinFormatMachO(); diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index f2debc443d240..2af9ec7b9bc81 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -70,6 +70,7 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case wasm64: return "wasm64"; case renderscript32: return "renderscript32"; case renderscript64: return "renderscript64"; + case ve: return "ve"; } llvm_unreachable("Invalid ArchType!"); @@ -144,6 +145,8 @@ StringRef Triple::getArchTypePrefix(ArchType Kind) { case riscv32: case riscv64: 
return "riscv"; + + case ve: return "ve"; } } @@ -313,6 +316,7 @@ Triple::ArchType Triple::getArchTypeForLLVMName(StringRef Name) { .Case("wasm64", wasm64) .Case("renderscript32", renderscript32) .Case("renderscript64", renderscript64) + .Case("ve", ve) .Default(UnknownArch); } @@ -441,6 +445,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("wasm64", Triple::wasm64) .Case("renderscript32", Triple::renderscript32) .Case("renderscript64", Triple::renderscript64) + .Case("ve", Triple::ve) .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the @@ -700,6 +705,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::tcele: case Triple::thumbeb: case Triple::xcore: + case Triple::ve: return Triple::ELF; case Triple::ppc: @@ -1283,6 +1289,7 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { case llvm::Triple::spir64: case llvm::Triple::wasm64: case llvm::Triple::renderscript64: + case llvm::Triple::ve: return 64; } llvm_unreachable("Invalid architecture value"); @@ -1311,6 +1318,7 @@ Triple Triple::get32BitArchVariant() const { case Triple::msp430: case Triple::systemz: case Triple::ppc64le: + case Triple::ve: T.setArch(UnknownArch); break; @@ -1403,6 +1411,7 @@ Triple Triple::get64BitArchVariant() const { case Triple::x86_64: case Triple::wasm64: case Triple::renderscript64: + case Triple::ve: // Already 64-bit. break; @@ -1461,6 +1470,7 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::xcore: case Triple::renderscript32: case Triple::renderscript64: + case Triple::ve: // ARM is intentionally unsupported here, changing the architecture would // drop any arch suffixes. @@ -1552,6 +1562,7 @@ bool Triple::isLittleEndian() const { case Triple::tcele: case Triple::renderscript32: case Triple::renderscript64: + case Triple::ve: return true; default: return false; diff --git a/llvm/lib/Target/LLVMBuild.txt b/llvm/lib/Target/LLVMBuild.txt index d6a95a3c67133..7403f7713a9f6 100644 --- a/llvm/lib/Target/LLVMBuild.txt +++ b/llvm/lib/Target/LLVMBuild.txt @@ -36,6 +36,7 @@ subdirectories = WebAssembly X86 XCore + VE ; This is a special group whose required libraries are extended (by llvm-build) ; with the best execution engine (the native JIT, if available, or the diff --git a/llvm/lib/Target/VE/CMakeLists.txt b/llvm/lib/Target/VE/CMakeLists.txt new file mode 100644 index 0000000000000..a3eb8bae4ac4a --- /dev/null +++ b/llvm/lib/Target/VE/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_TARGET_DEFINITIONS VE.td) + +add_llvm_target(VECodeGen + VETargetMachine.cpp + ) + +add_subdirectory(TargetInfo) +add_subdirectory(MCTargetDesc) diff --git a/llvm/lib/Target/VE/LLVMBuild.txt b/llvm/lib/Target/VE/LLVMBuild.txt new file mode 100644 index 0000000000000..b45efd45c8aca --- /dev/null +++ b/llvm/lib/Target/VE/LLVMBuild.txt @@ -0,0 +1,33 @@ +;===- ./lib/Target/VE/LLVMBuild.txt ----------------------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. 
+; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = MCTargetDesc TargetInfo + +[component_0] +type = TargetGroup +name = VE +parent = Target +has_asmparser = 0 +has_asmprinter = 0 + +[component_1] +type = Library +name = VECodeGen +parent = VE +required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG + VEDesc VEInfo Support Target +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt new file mode 100644 index 0000000000000..fa2fefbe47f05 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_library(LLVMVEDesc + VEMCTargetDesc.cpp + ) diff --git a/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt b/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt new file mode 100644 index 0000000000000..e585042e60bba --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Target/VE/MCTargetDesc/LLVMBuild.txt ---------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = VEDesc +parent = VE +required_libraries = MC VEInfo Support +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp new file mode 100644 index 0000000000000..7067f34a016f7 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp @@ -0,0 +1,19 @@ +//===-- VEMCTargetDesc.cpp - VE Target Descriptions -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides VE specific target descriptions. +// +//===----------------------------------------------------------------------===// + +#include "VEMCTargetDesc.h" + +using namespace llvm; + +extern "C" void LLVMInitializeVETargetMC() { + // TODO +} diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h new file mode 100644 index 0000000000000..a7969042606c0 --- /dev/null +++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h @@ -0,0 +1,27 @@ +//===-- VEMCTargetDesc.h - VE Target Descriptions ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides VE specific target descriptions. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCTARGETDESC_H +#define LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCTARGETDESC_H + +#include "llvm/Support/DataTypes.h" + +#include + +namespace llvm { + +class Target; +Target &getTheVETarget(); + +} // end llvm namespace + +#endif diff --git a/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt b/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt new file mode 100644 index 0000000000000..0850b0f27bf2a --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/CMakeLists.txt @@ -0,0 +1,3 @@ +add_llvm_component_library(LLVMVEInfo + VETargetInfo.cpp + ) diff --git a/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt b/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt new file mode 100644 index 0000000000000..c440132476a4d --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/Target/VE/TargetInfo/LLVMBuild.txt -----------------*- Conf -*--===; +; +; Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +; See https://llvm.org/LICENSE.txt for license information. +; SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = VEInfo +parent = VE +required_libraries = Support +add_to_library_groups = VE diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp new file mode 100644 index 0000000000000..be68fe7d24291 --- /dev/null +++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp @@ -0,0 +1,23 @@ +//===-- VETargetInfo.cpp - VE Target Implementation -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "VE.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +Target &llvm::getTheVETarget() { + static Target TheVETarget; + return TheVETarget; +} + +extern "C" void LLVMInitializeVETargetInfo() { + RegisterTarget X(getTheVETarget(), "ve", + "VE", "VE"); +} diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h new file mode 100644 index 0000000000000..51d3e701f8ec0 --- /dev/null +++ b/llvm/lib/Target/VE/VE.h @@ -0,0 +1,19 @@ +//===-- VE.h - Top-level interface for VE representation --------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains the entry points for global functions defined in the LLVM +// VE back-end. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_VE_H +#define LLVM_LIB_TARGET_VE_VE_H + +#include "MCTargetDesc/VEMCTargetDesc.h" + +#endif diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp new file mode 100644 index 0000000000000..10fe9ba0e7ebc --- /dev/null +++ b/llvm/lib/Target/VE/VETargetMachine.cpp @@ -0,0 +1,62 @@ +//===-- VETargetMachine.cpp - Define TargetMachine for VE -----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// +//===----------------------------------------------------------------------===// + +#include "VETargetMachine.h" +#include "VE.h" +#include "llvm/Support/TargetRegistry.h" + +using namespace llvm; + +#define DEBUG_TYPE "ve" + +extern "C" void LLVMInitializeVETarget() { + // Register the target. + RegisterTargetMachine X(getTheVETarget()); +} + +static std::string computeDataLayout(const Triple &T) { + // Aurora VE is little endian + std::string Ret = "e"; + + // Use ELF mangling + Ret += "-m:e"; + + // Alignments for 64 bit integers. + Ret += "-i64:64"; + + // VE supports 32 bit and 64 bits integer on registers + Ret += "-n32:64"; + + // Stack alignment is 64 bits + Ret += "-S64"; + + return Ret; +} + +static Reloc::Model getEffectiveRelocModel(Optional RM) { + if (!RM.hasValue()) + return Reloc::Static; + return *RM; +} + +/// Create an Aurora VE architecture model +VETargetMachine::VETargetMachine( + const Target &T, const Triple &TT, StringRef CPU, StringRef FS, + const TargetOptions &Options, Optional RM, + Optional CM, CodeGenOpt::Level OL, bool JIT) + : LLVMTargetMachine( + T, computeDataLayout(TT), TT, CPU, FS, Options, + getEffectiveRelocModel(RM), + getEffectiveCodeModel(CM, CodeModel::Small), + OL) +{} + +VETargetMachine::~VETargetMachine() {} diff --git a/llvm/lib/Target/VE/VETargetMachine.h b/llvm/lib/Target/VE/VETargetMachine.h new file mode 100644 index 0000000000000..ac6089036ff8e --- /dev/null +++ b/llvm/lib/Target/VE/VETargetMachine.h @@ -0,0 +1,31 @@ +//===-- VETargetMachine.h - Define TargetMachine for VE ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares the VE specific subclass of TargetMachine. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_VE_VETARGETMACHINE_H +#define LLVM_LIB_TARGET_VE_VETARGETMACHINE_H + +#include "llvm/Target/TargetMachine.h" + +namespace llvm { + +class VETargetMachine : public LLVMTargetMachine { +public: + VETargetMachine(const Target &T, const Triple &TT, StringRef CPU, + StringRef FS, const TargetOptions &Options, + Optional RM, Optional CM, + CodeGenOpt::Level OL, bool JIT); + ~VETargetMachine() override; +}; + +} // namespace llvm + +#endif diff --git a/llvm/test/CodeGen/VE/lit.local.cfg b/llvm/test/CodeGen/VE/lit.local.cfg new file mode 100644 index 0000000000000..b6366779272df --- /dev/null +++ b/llvm/test/CodeGen/VE/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'VE' in config.root.targets: + config.unsupported = True diff --git a/llvm/test/CodeGen/VE/target_support.ll b/llvm/test/CodeGen/VE/target_support.ll new file mode 100644 index 0000000000000..336d9cd367208 --- /dev/null +++ b/llvm/test/CodeGen/VE/target_support.ll @@ -0,0 +1,2 @@ +; RUN: llc --version | FileCheck %s +; CHECK: ve - VE diff --git a/llvm/unittests/ADT/TripleTest.cpp b/llvm/unittests/ADT/TripleTest.cpp index c7f40dd1ca143..ef7f82d268e2e 100644 --- a/llvm/unittests/ADT/TripleTest.cpp +++ b/llvm/unittests/ADT/TripleTest.cpp @@ -319,6 +319,12 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::AMDPAL, T.getOS()); EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("ve-unknown-linux"); + EXPECT_EQ(Triple::ve, T.getArch()); + EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); + EXPECT_EQ(Triple::Linux, T.getOS()); + EXPECT_EQ(Triple::UnknownEnvironment, T.getEnvironment()); + T = Triple("riscv32-unknown-unknown"); EXPECT_EQ(Triple::riscv32, T.getArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); @@ -722,6 +728,8 @@ TEST(TripleTest, Normalization) { Triple::normalize("i686-linux")); // i686-pc-linux-gnu EXPECT_EQ("arm-none-unknown-eabi", Triple::normalize("arm-none-eabi")); // arm-none-eabi + EXPECT_EQ("ve-unknown-linux", + Triple::normalize("ve-linux")); // ve-linux EXPECT_EQ("wasm32-unknown-wasi", Triple::normalize("wasm32-wasi")); // wasm32-unknown-wasi EXPECT_EQ("wasm64-unknown-wasi", From 7adf6644f50a60a25f81c976b0f3a08320a63665 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 9 Jan 2020 09:52:04 +0000 Subject: [PATCH 37/89] [Matrix] Propagate and use shape information for loads. This patch extends to shape propagation to also include load instructions and implements shape aware lowering for vector loads. Reviewers: anemet, Gerolf, reames, hfinkel, andrew.w.kaylor Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D70900 --- .../Scalar/LowerMatrixIntrinsics.cpp | 42 +++- .../bigger-expressions-double.ll | 236 +++++++++--------- .../propagate-backward.ll | 27 +- 3 files changed, 169 insertions(+), 136 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index c39fdac93b987..afe1b4e7cc784 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -95,20 +95,20 @@ Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride, unsigned AS = cast(BasePtr->getType())->getAddressSpace(); // Compute the start of the column with index Col as Col * Stride. - Value *ColumnStart = Builder.CreateMul(Col, Stride); + Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start"); // Get pointer to the start of the selected column. 
   // if we select column 0.
   if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero())
     ColumnStart = BasePtr;
   else
-    ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart);
+    ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep");
 
   // Cast elementwise column start pointer to a pointer to a column
   // (EltType x NumRows)*.
   Type *ColumnType = VectorType::get(EltType, NumRows);
   Type *ColumnPtrType = PointerType::get(ColumnType, AS);
-  return Builder.CreatePointerCast(ColumnStart, ColumnPtrType);
+  return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast");
 }
 
 /// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
@@ -317,7 +317,7 @@ class LowerMatrixIntrinsics {
     default:
       return false;
     }
-    return isUniformShape(V) || isa<StoreInst>(V);
+    return isUniformShape(V) || isa<StoreInst>(V) || isa<LoadInst>(V);
   }
 
   /// Propagate the shape information of instructions to their users.
@@ -481,6 +481,8 @@ class LowerMatrixIntrinsics {
       Value *Op2;
       if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst))
        Changed |= VisitBinaryOperator(BinOp);
+      if (match(&Inst, m_Load(m_Value(Op1))))
+        Changed |= VisitLoad(&Inst, Op1, Builder);
      else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2))))
        Changed |= VisitStore(&Inst, Op1, Op2, Builder);
     }
@@ -495,7 +497,7 @@ class LowerMatrixIntrinsics {
   LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType,
                              IRBuilder<> Builder) {
     unsigned Align = DL.getABITypeAlignment(EltType);
-    return Builder.CreateAlignedLoad(ColumnPtr, Align);
+    return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load");
   }
 
   StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr,
@@ -536,17 +538,11 @@ class LowerMatrixIntrinsics {
     return true;
   }
 
-  /// Lowers llvm.matrix.columnwise.load.
-  ///
-  /// The intrinsic loads a matrix from memory using a stride between columns.
-  void LowerColumnwiseLoad(CallInst *Inst) {
+  void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride,
+                 ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
-    Value *Ptr = Inst->getArgOperand(0);
-    Value *Stride = Inst->getArgOperand(1);
     auto VType = cast<VectorType>(Inst->getType());
     Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
-    ShapeInfo Shape(Inst->getArgOperand(2), Inst->getArgOperand(3));
-
     ColumnMatrixTy Result;
     // Distance between start of one column and the start of the next
     for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) {
@@ -560,6 +556,16 @@ class LowerMatrixIntrinsics {
     finalizeLowering(Inst, Result, Builder);
   }
 
+  /// Lowers llvm.matrix.columnwise.load.
+  ///
+  /// The intrinsic loads a matrix from memory using a stride between columns.
+  void LowerColumnwiseLoad(CallInst *Inst) {
+    Value *Ptr = Inst->getArgOperand(0);
+    Value *Stride = Inst->getArgOperand(1);
+    LowerLoad(Inst, Ptr, Stride,
+              {Inst->getArgOperand(2), Inst->getArgOperand(3)});
+  }
+
   void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
                   ShapeInfo Shape) {
     IRBuilder<> Builder(Inst);
@@ -755,6 +761,16 @@ class LowerMatrixIntrinsics {
     finalizeLowering(Inst, Result, Builder);
   }
 
+  /// Lower load instructions, if shape information is available.
+ bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) { + auto I = ShapeMap.find(Inst); + if (I == ShapeMap.end()) + return false; + + LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second); + return true; + } + bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr, IRBuilder<> &Builder) { auto I = ShapeMap.find(StoredVal); diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll index 246f4e42c2fa7..b72ecf5c0e4c0 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/bigger-expressions-double.ll @@ -6,59 +6,63 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-LABEL: @transpose_multiply( ; CHECK-NEXT: entry: -; Load input matrixes %A and %B. - -; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] -; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] - -; Extract columns from loaded value %A. - -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; Load columns of input matrixes %A and %B. + +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 ; Transpose %A. 
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 -; Extract columns from %B. 
- -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> - ; Lower multiply(transpose(%A), %B) ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] ; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT7]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK6]], [[SPLAT_SPLAT8]] ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x double> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] @@ -66,18 +70,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP26]], <3 x i32> ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP28]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double [[TMP30]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x 
double> [[BLOCK15]], [[SPLAT_SPLAT17]] ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x double> [[TMP29]], [[TMP31]] ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] @@ -85,18 +89,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <1 x double> [[TMP35]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <3 x double> [[TMP27]], <3 x double> [[TMP36]], <3 x i32> ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP38]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP39:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] ; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP40]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] ; CHECK-NEXT: [[TMP42:%.*]] = fadd <1 x double> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] @@ -104,18 +108,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <1 x double> [[TMP45]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <3 x double> [[TMP37]], <3 x double> [[TMP46]], <3 x i32> ; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP48]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> 
undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] ; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP50]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] ; CHECK-NEXT: [[TMP52:%.*]] = fadd <1 x double> [[TMP49]], [[TMP51]] ; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] @@ -123,18 +127,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <1 x double> [[TMP55]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP56]], <3 x i32> ; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP58]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] ; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP60]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] ; CHECK-NEXT: [[TMP62:%.*]] = fadd <1 x double> [[TMP59]], [[TMP61]] ; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] @@ -142,18 +146,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, 
<9 x ; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <1 x double> [[TMP65]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <3 x double> [[TMP57]], <3 x double> [[TMP66]], <3 x i32> ; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP68]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP69:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] ; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP70]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] ; CHECK-NEXT: [[TMP72:%.*]] = fadd <1 x double> [[TMP69]], [[TMP71]] ; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] @@ -161,18 +165,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <1 x double> [[TMP75]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <3 x double> [[TMP67]], <3 x double> [[TMP76]], <3 x i32> ; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP78]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP79:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] ; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP80]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] ; CHECK-NEXT: [[TMP82:%.*]] = fadd <1 x double> [[TMP79]], [[TMP81]] ; 
CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] @@ -180,18 +184,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <1 x double> [[TMP85]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP86]], <3 x i32> ; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP88]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP89:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] ; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP90]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] ; CHECK-NEXT: [[TMP92:%.*]] = fadd <1 x double> [[TMP89]], [[TMP91]] ; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] @@ -199,18 +203,18 @@ define void @transpose_multiply(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, <9 x ; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <1 x double> [[TMP95]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <3 x double> [[TMP87]], <3 x double> [[TMP96]], <3 x i32> ; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double [[TMP98]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP99:%.*]] = fmul <1 x double> [[BLOCK75]], 
[[SPLAT_SPLAT77]] ; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP100]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] ; CHECK-NEXT: [[TMP102:%.*]] = fadd <1 x double> [[TMP99]], [[TMP101]] ; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] @@ -248,59 +252,61 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-LABEL: @transpose_multiply_add( ; CHECK-NEXT: entry: -; Load input matrixes %A and %B. - -; CHECK-NEXT: [[A:%.*]] = load <9 x double>, <9 x double>* [[A_PTR:%.*]] -; CHECK-NEXT: [[B:%.*]] = load <9 x double>, <9 x double>* [[B_PTR:%.*]] - -; Extract columns from loaded value %A. - -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <9 x double> [[A]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <9 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 3 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <9 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST6:%.*]] = bitcast double* [[TMP1]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD7:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST6]], align 8 +; CHECK-NEXT: [[COL_GEP8:%.*]] = getelementptr double, double* [[TMP1]], i32 3 +; CHECK-NEXT: [[COL_CAST9:%.*]] = bitcast double* [[COL_GEP8]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD10:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST9]], align 8 +; CHECK-NEXT: [[COL_GEP11:%.*]] = getelementptr double, double* [[TMP1]], i32 6 +; CHECK-NEXT: [[COL_CAST12:%.*]] = bitcast double* [[COL_GEP11]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST12]], align 8 ; Transpose %A. 
-; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[SPLIT]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x double> undef, double [[TMP0]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 0 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x double> [[TMP1]], double [[TMP2]], i64 1 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x double> [[TMP3]], double [[TMP4]], i64 2 -; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[SPLIT]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 1 ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <3 x double> undef, double [[TMP6]], i64 0 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <3 x double> [[TMP7]], double [[TMP8]], i64 1 -; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 1 +; CHECK-NEXT: [[TMP10:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 1 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <3 x double> [[TMP9]], double [[TMP10]], i64 2 -; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[SPLIT]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = extractelement <3 x double> [[COL_LOAD]], i64 2 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <3 x double> undef, double [[TMP12]], i64 0 -; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[SPLIT1]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <3 x double> [[COL_LOAD2]], i64 2 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <3 x double> [[TMP13]], double [[TMP14]], i64 1 -; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[SPLIT2]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <3 x double> [[COL_LOAD5]], i64 2 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <3 x double> [[TMP15]], double [[TMP16]], i64 2 -; Extract columns from %B. 
- -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <9 x double> [[B]], <9 x double> undef, <3 x i32> - ; Lower multiply(transpose(%A), %B) ; CHECK-NEXT: [[BLOCK:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT:%.*]] = insertelement <1 x double> undef, double [[TMP18]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul <1 x double> [[BLOCK]], [[SPLAT_SPLAT]] ; CHECK-NEXT: [[BLOCK6:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT7:%.*]] = insertelement <1 x double> undef, double [[TMP20]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT8:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT7]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = fmul <1 x double> [[BLOCK6]], [[SPLAT_SPLAT8]] ; CHECK-NEXT: [[TMP22:%.*]] = fadd <1 x double> [[TMP19]], [[TMP21]] ; CHECK-NEXT: [[BLOCK9:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP23:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT10:%.*]] = insertelement <1 x double> undef, double [[TMP23]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT11:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT10]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fmul <1 x double> [[BLOCK9]], [[SPLAT_SPLAT11]] @@ -308,18 +314,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <1 x double> [[TMP25]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP26]], <3 x i32> ; CHECK-NEXT: [[BLOCK12:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT13:%.*]] = insertelement <1 x double> undef, double [[TMP28]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT14:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT13]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = fmul <1 x double> [[BLOCK12]], [[SPLAT_SPLAT14]] ; CHECK-NEXT: [[BLOCK15:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT16:%.*]] = insertelement <1 x double> undef, double [[TMP30]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT17:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT16]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP31:%.*]] = fmul <1 x 
double> [[BLOCK15]], [[SPLAT_SPLAT17]] ; CHECK-NEXT: [[TMP32:%.*]] = fadd <1 x double> [[TMP29]], [[TMP31]] ; CHECK-NEXT: [[BLOCK18:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT19:%.*]] = insertelement <1 x double> undef, double [[TMP33]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT19]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP34:%.*]] = fmul <1 x double> [[BLOCK18]], [[SPLAT_SPLAT20]] @@ -327,18 +333,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <1 x double> [[TMP35]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = shufflevector <3 x double> [[TMP27]], <3 x double> [[TMP36]], <3 x i32> ; CHECK-NEXT: [[BLOCK21:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 0 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT22:%.*]] = insertelement <1 x double> undef, double [[TMP38]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT22]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP39:%.*]] = fmul <1 x double> [[BLOCK21]], [[SPLAT_SPLAT23]] ; CHECK-NEXT: [[BLOCK24:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 1 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT25:%.*]] = insertelement <1 x double> undef, double [[TMP40]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT26:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT25]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = fmul <1 x double> [[BLOCK24]], [[SPLAT_SPLAT26]] ; CHECK-NEXT: [[TMP42:%.*]] = fadd <1 x double> [[TMP39]], [[TMP41]] ; CHECK-NEXT: [[BLOCK27:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[SPLIT3]], i64 2 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <3 x double> [[COL_LOAD7]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT28:%.*]] = insertelement <1 x double> undef, double [[TMP43]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT28]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = fmul <1 x double> [[BLOCK27]], [[SPLAT_SPLAT29]] @@ -346,18 +352,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP46:%.*]] = shufflevector <1 x double> [[TMP45]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP47:%.*]] = shufflevector <3 x double> [[TMP37]], <3 x double> [[TMP46]], <3 x i32> ; CHECK-NEXT: [[BLOCK30:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP48:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT31:%.*]] = insertelement <1 x double> undef, double [[TMP48]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT32:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT31]], <1 x double> 
undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP49:%.*]] = fmul <1 x double> [[BLOCK30]], [[SPLAT_SPLAT32]] ; CHECK-NEXT: [[BLOCK33:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP50:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT34:%.*]] = insertelement <1 x double> undef, double [[TMP50]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT35:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT34]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP51:%.*]] = fmul <1 x double> [[BLOCK33]], [[SPLAT_SPLAT35]] ; CHECK-NEXT: [[TMP52:%.*]] = fadd <1 x double> [[TMP49]], [[TMP51]] ; CHECK-NEXT: [[BLOCK36:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP53:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT37:%.*]] = insertelement <1 x double> undef, double [[TMP53]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT38:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT37]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP54:%.*]] = fmul <1 x double> [[BLOCK36]], [[SPLAT_SPLAT38]] @@ -365,18 +371,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <1 x double> [[TMP55]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP56]], <3 x i32> ; CHECK-NEXT: [[BLOCK39:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP58:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT40:%.*]] = insertelement <1 x double> undef, double [[TMP58]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT41:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT40]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP59:%.*]] = fmul <1 x double> [[BLOCK39]], [[SPLAT_SPLAT41]] ; CHECK-NEXT: [[BLOCK42:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP60:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT43:%.*]] = insertelement <1 x double> undef, double [[TMP60]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT44:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT43]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP61:%.*]] = fmul <1 x double> [[BLOCK42]], [[SPLAT_SPLAT44]] ; CHECK-NEXT: [[TMP62:%.*]] = fadd <1 x double> [[TMP59]], [[TMP61]] ; CHECK-NEXT: [[BLOCK45:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT46:%.*]] = insertelement <1 x double> undef, double [[TMP63]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT46]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP64:%.*]] = fmul <1 x double> [[BLOCK45]], [[SPLAT_SPLAT47]] @@ -384,18 +390,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* 
%B.Ptr, ; CHECK-NEXT: [[TMP66:%.*]] = shufflevector <1 x double> [[TMP65]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <3 x double> [[TMP57]], <3 x double> [[TMP66]], <3 x i32> ; CHECK-NEXT: [[BLOCK48:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 0 +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT49:%.*]] = insertelement <1 x double> undef, double [[TMP68]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT49]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP69:%.*]] = fmul <1 x double> [[BLOCK48]], [[SPLAT_SPLAT50]] ; CHECK-NEXT: [[BLOCK51:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 1 +; CHECK-NEXT: [[TMP70:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT52:%.*]] = insertelement <1 x double> undef, double [[TMP70]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT53:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT52]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP71:%.*]] = fmul <1 x double> [[BLOCK51]], [[SPLAT_SPLAT53]] ; CHECK-NEXT: [[TMP72:%.*]] = fadd <1 x double> [[TMP69]], [[TMP71]] ; CHECK-NEXT: [[BLOCK54:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[SPLIT4]], i64 2 +; CHECK-NEXT: [[TMP73:%.*]] = extractelement <3 x double> [[COL_LOAD10]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT55:%.*]] = insertelement <1 x double> undef, double [[TMP73]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT56:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT55]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP74:%.*]] = fmul <1 x double> [[BLOCK54]], [[SPLAT_SPLAT56]] @@ -403,18 +409,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP76:%.*]] = shufflevector <1 x double> [[TMP75]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = shufflevector <3 x double> [[TMP67]], <3 x double> [[TMP76]], <3 x i32> ; CHECK-NEXT: [[BLOCK57:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT58:%.*]] = insertelement <1 x double> undef, double [[TMP78]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT58]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP79:%.*]] = fmul <1 x double> [[BLOCK57]], [[SPLAT_SPLAT59]] ; CHECK-NEXT: [[BLOCK60:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT61:%.*]] = insertelement <1 x double> undef, double [[TMP80]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT61]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP81:%.*]] = fmul <1 x double> [[BLOCK60]], [[SPLAT_SPLAT62]] ; CHECK-NEXT: [[TMP82:%.*]] = fadd <1 x double> [[TMP79]], [[TMP81]] ; 
CHECK-NEXT: [[BLOCK63:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> zeroinitializer -; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP83:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT64:%.*]] = insertelement <1 x double> undef, double [[TMP83]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT65:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT64]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP84:%.*]] = fmul <1 x double> [[BLOCK63]], [[SPLAT_SPLAT65]] @@ -422,18 +428,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP86:%.*]] = shufflevector <1 x double> [[TMP85]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP87:%.*]] = shufflevector <3 x double> undef, <3 x double> [[TMP86]], <3 x i32> ; CHECK-NEXT: [[BLOCK66:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP88:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT67:%.*]] = insertelement <1 x double> undef, double [[TMP88]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT68:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT67]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP89:%.*]] = fmul <1 x double> [[BLOCK66]], [[SPLAT_SPLAT68]] ; CHECK-NEXT: [[BLOCK69:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP90:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT70:%.*]] = insertelement <1 x double> undef, double [[TMP90]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT71:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT70]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP91:%.*]] = fmul <1 x double> [[BLOCK69]], [[SPLAT_SPLAT71]] ; CHECK-NEXT: [[TMP92:%.*]] = fadd <1 x double> [[TMP89]], [[TMP91]] ; CHECK-NEXT: [[BLOCK72:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP93:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT73:%.*]] = insertelement <1 x double> undef, double [[TMP93]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT74:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT73]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP94:%.*]] = fmul <1 x double> [[BLOCK72]], [[SPLAT_SPLAT74]] @@ -441,18 +447,18 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <1 x double> [[TMP95]], <1 x double> undef, <3 x i32> ; CHECK-NEXT: [[TMP97:%.*]] = shufflevector <3 x double> [[TMP87]], <3 x double> [[TMP96]], <3 x i32> ; CHECK-NEXT: [[BLOCK75:%.*]] = shufflevector <3 x double> [[TMP5]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 0 +; CHECK-NEXT: [[TMP98:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 0 ; CHECK-NEXT: [[SPLAT_SPLATINSERT76:%.*]] = insertelement <1 x double> undef, double [[TMP98]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT77:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT76]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP99:%.*]] = fmul <1 x double> [[BLOCK75]], 
[[SPLAT_SPLAT77]] ; CHECK-NEXT: [[BLOCK78:%.*]] = shufflevector <3 x double> [[TMP11]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 1 +; CHECK-NEXT: [[TMP100:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 1 ; CHECK-NEXT: [[SPLAT_SPLATINSERT79:%.*]] = insertelement <1 x double> undef, double [[TMP100]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT80:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT79]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP101:%.*]] = fmul <1 x double> [[BLOCK78]], [[SPLAT_SPLAT80]] ; CHECK-NEXT: [[TMP102:%.*]] = fadd <1 x double> [[TMP99]], [[TMP101]] ; CHECK-NEXT: [[BLOCK81:%.*]] = shufflevector <3 x double> [[TMP17]], <3 x double> undef, <1 x i32> -; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[SPLIT5]], i64 2 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <3 x double> [[COL_LOAD13]], i64 2 ; CHECK-NEXT: [[SPLAT_SPLATINSERT82:%.*]] = insertelement <1 x double> undef, double [[TMP103]], i32 0 ; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <1 x double> [[SPLAT_SPLATINSERT82]], <1 x double> undef, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP104:%.*]] = fmul <1 x double> [[BLOCK81]], [[SPLAT_SPLAT83]] @@ -465,19 +471,21 @@ define void @transpose_multiply_add(<9 x double>* %A.Ptr, <9 x double>* %B.Ptr, ; Load %C. -; CHECK-NEXT: [[C:%.*]] = load <9 x double>, <9 x double>* [[C_PTR:%.*]] - -; Extract columns from %C. - -; CHECK-NEXT: [[SPLIT84:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT85:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> -; CHECK-NEXT: [[SPLIT86:%.*]] = shufflevector <9 x double> [[C]], <9 x double> undef, <3 x i32> +; CHECK-NEXT: [[TMP110:%.*]] = bitcast <9 x double>* [[C_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST92:%.*]] = bitcast double* [[TMP110]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD93:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST92]], align 8 +; CHECK-NEXT: [[COL_GEP94:%.*]] = getelementptr double, double* [[TMP110]], i32 3 +; CHECK-NEXT: [[COL_CAST95:%.*]] = bitcast double* [[COL_GEP94]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST95]], align 8 +; CHECK-NEXT: [[COL_GEP97:%.*]] = getelementptr double, double* [[TMP110]], i32 6 +; CHECK-NEXT: [[COL_CAST98:%.*]] = bitcast double* [[COL_GEP97]] to <3 x double>* +; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <3 x double>, <3 x double>* [[COL_CAST98]], align 8 ; Add column vectors. -; CHECK-NEXT: [[TMP108:%.*]] = fadd <3 x double> [[SPLIT84]], [[TMP47]] -; CHECK-NEXT: [[TMP109:%.*]] = fadd <3 x double> [[SPLIT85]], [[TMP77]] -; CHECK-NEXT: [[TMP110:%.*]] = fadd <3 x double> [[SPLIT86]], [[TMP107]] +; CHECK-NEXT: [[TMP108:%.*]] = fadd <3 x double> [[COL_LOAD93]], [[TMP47]] +; CHECK-NEXT: [[TMP109:%.*]] = fadd <3 x double> [[COL_LOAD96]], [[TMP77]] +; CHECK-NEXT: [[TMP110:%.*]] = fadd <3 x double> [[COL_LOAD99]], [[TMP107]] ; Store result columns. 
diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll index f4d7c0e5120e7..89ca79649b879 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-backward.ll @@ -45,19 +45,28 @@ entry: define <8 x double> @load_fadd_transpose(<8 x double>* %A.Ptr, <8 x double> %b) { ; CHECK-LABEL: @load_fadd_transpose( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[A:%.*]] = load <8 x double>, <8 x double>* [[A_PTR:%.*]] -; CHECK-NEXT: [[SPLIT:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> -; CHECK-NEXT: [[SPLIT1:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> -; CHECK-NEXT: [[SPLIT2:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> -; CHECK-NEXT: [[SPLIT3:%.*]] = shufflevector <8 x double> [[A]], <8 x double> undef, <2 x i32> + +; CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[COL_CAST:%.*]] = bitcast double* [[TMP0]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST]], align 8 +; CHECK-NEXT: [[COL_GEP:%.*]] = getelementptr double, double* [[TMP0]], i32 2 +; CHECK-NEXT: [[COL_CAST1:%.*]] = bitcast double* [[COL_GEP]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST1]], align 8 +; CHECK-NEXT: [[COL_GEP3:%.*]] = getelementptr double, double* [[TMP0]], i32 4 +; CHECK-NEXT: [[COL_CAST4:%.*]] = bitcast double* [[COL_GEP3]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST4]], align 8 +; CHECK-NEXT: [[COL_GEP6:%.*]] = getelementptr double, double* [[TMP0]], i32 6 +; CHECK-NEXT: [[COL_CAST7:%.*]] = bitcast double* [[COL_GEP6]] to <2 x double>* +; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, <2 x double>* [[COL_CAST7]], align 8 ; CHECK-NEXT: [[SPLIT4:%.*]] = shufflevector <8 x double> [[B:%.*]], <8 x double> undef, <2 x i32> ; CHECK-NEXT: [[SPLIT5:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> ; CHECK-NEXT: [[SPLIT6:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> ; CHECK-NEXT: [[SPLIT7:%.*]] = shufflevector <8 x double> [[B]], <8 x double> undef, <2 x i32> -; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[SPLIT]], [[SPLIT4]] -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[SPLIT1]], [[SPLIT5]] -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[SPLIT2]], [[SPLIT6]] -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[SPLIT3]], [[SPLIT7]] + +; CHECK-NEXT: [[TMP0:%.*]] = fadd <2 x double> [[COL_LOAD]], [[SPLIT4]] +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[COL_LOAD2]], [[SPLIT5]] +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x double> [[COL_LOAD5]], [[SPLIT6]] +; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[COL_LOAD8]], [[SPLIT7]] ; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x double> [[TMP0]], i64 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x double> undef, double [[TMP4]], i64 0 ; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x double> [[TMP1]], i64 0 From 287a874d1c460302677a1530a75d94bae4d4a348 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Fri, 13 Dec 2019 09:34:59 +0100 Subject: [PATCH 38/89] [clangd] Refurbish HoverInfo::present Summary: Improves basic hover presentation logic to include more info. 
Reviewers: sammccall Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D71555 --- clang-tools-extra/clangd/Hover.cpp | 96 +++++++++++++------ clang-tools-extra/clangd/test/hover.test | 4 +- .../clangd/unittests/HoverTests.cpp | 89 +++++++++++++++++ 3 files changed, 157 insertions(+), 32 deletions(-) diff --git a/clang-tools-extra/clangd/Hover.cpp b/clang-tools-extra/clangd/Hover.cpp index b1a2e289eed79..20883b347fdc4 100644 --- a/clang-tools-extra/clangd/Hover.cpp +++ b/clang-tools-extra/clangd/Hover.cpp @@ -25,9 +25,11 @@ #include "clang/AST/PrettyPrinter.h" #include "clang/Index/IndexSymbol.h" #include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/iterator_range.h" -#include "llvm/Support/Casting.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/Support/raw_ostream.h" +#include namespace clang { namespace clangd { @@ -224,8 +226,8 @@ void enhanceFromIndex(HoverInfo &Hover, const NamedDecl &ND, // Populates Type, ReturnType, and Parameters for function-like decls. void fillFunctionTypeAndParams(HoverInfo &HI, const Decl *D, - const FunctionDecl *FD, - const PrintingPolicy &Policy) { + const FunctionDecl *FD, + const PrintingPolicy &Policy) { HI.Parameters.emplace(); for (const ParmVarDecl *PVD : FD->parameters()) { HI.Parameters->emplace_back(); @@ -250,11 +252,11 @@ void fillFunctionTypeAndParams(HoverInfo &HI, const Decl *D, } } - if (const auto* CCD = llvm::dyn_cast(FD)) { + if (const auto *CCD = llvm::dyn_cast(FD)) { // Constructor's "return type" is the class type. HI.ReturnType = declaredType(CCD->getParent()).getAsString(Policy); // Don't provide any type for the constructor itself. - } else if (llvm::isa(FD)){ + } else if (llvm::isa(FD)) { HI.ReturnType = "void"; } else { HI.ReturnType = FD->getReturnType().getAsString(Policy); @@ -309,7 +311,7 @@ llvm::Optional printExprValue(const SelectionTree::Node *N, } /// Generate a \p Hover object given the declaration \p D. -HoverInfo getHoverContents(const Decl *D, const SymbolIndex *Index) { +HoverInfo getHoverContents(const NamedDecl *D, const SymbolIndex *Index) { HoverInfo HI; const ASTContext &Ctx = D->getASTContext(); @@ -321,12 +323,10 @@ HoverInfo getHoverContents(const Decl *D, const SymbolIndex *Index) { HI.LocalScope.append("::"); PrintingPolicy Policy = printingPolicyForDecls(Ctx.getPrintingPolicy()); - if (const NamedDecl *ND = llvm::dyn_cast(D)) { - HI.Name = printName(Ctx, *ND); - ND = getDeclForComment(ND); - HI.Documentation = getDeclComment(Ctx, *ND); - enhanceFromIndex(HI, *ND, Index); - } + HI.Name = printName(Ctx, *D); + const auto *CommentD = getDeclForComment(D); + HI.Documentation = getDeclComment(Ctx, *CommentD); + enhanceFromIndex(HI, *CommentD, Index); HI.Kind = index::getSymbolInfo(D).Kind; @@ -460,34 +460,70 @@ llvm::Optional getHover(ParsedAST &AST, Position Pos, tooling::applyAllReplacements(HI->Definition, Replacements)) HI->Definition = *Formatted; - HI->SymRange = getTokenRange(AST.getSourceManager(), - AST.getLangOpts(), SourceLocationBeg); + HI->SymRange = getTokenRange(AST.getSourceManager(), AST.getLangOpts(), + SourceLocationBeg); return HI; } markup::Document HoverInfo::present() const { markup::Document Output; - if (NamespaceScope) { - auto &P = Output.addParagraph(); - P.appendText("Declared in"); - // Drop trailing "::". 
- if (!LocalScope.empty()) - P.appendCode(llvm::StringRef(LocalScope).drop_back(2)); - else if (NamespaceScope->empty()) - P.appendCode("global namespace"); - else - P.appendCode(llvm::StringRef(*NamespaceScope).drop_back(2)); + // Header contains a text of the form: + // variable `var` : `int` + // + // class `X` + // + // function `foo` → `int` + markup::Paragraph &Header = Output.addParagraph(); + Header.appendText(index::getSymbolKindString(Kind)); + assert(!Name.empty() && "hover triggered on a nameless symbol"); + Header.appendCode(Name); + if (ReturnType) { + Header.appendText("→"); + Header.appendCode(*ReturnType); + } else if (Type) { + Header.appendText(":"); + Header.appendCode(*Type); } - if (!Definition.empty()) { - Output.addCodeBlock(Definition); - } else { - // Builtin types - Output.addCodeBlock(Name); + // For functions we display signature in a list form, e.g.: + // - `bool param1` + // - `int param2 = 5` + if (Parameters && !Parameters->empty()) { + markup::BulletList &L = Output.addBulletList(); + for (const auto &Param : *Parameters) { + std::string Buffer; + llvm::raw_string_ostream OS(Buffer); + OS << Param; + L.addItem().addParagraph().appendCode(std::move(OS.str())); + } + } + + if (Value) { + markup::Paragraph &P = Output.addParagraph(); + P.appendText("Value ="); + P.appendCode(*Value); } if (!Documentation.empty()) Output.addParagraph().appendText(Documentation); + + if (!Definition.empty()) { + std::string ScopeComment; + // Drop trailing "::". + if (!LocalScope.empty()) { + // Container name, e.g. class, method, function. + // We might want to propogate some info about container type to print + // function foo, class X, method X::bar, etc. + ScopeComment = + "// In " + llvm::StringRef(LocalScope).rtrim(':').str() + '\n'; + } else if (NamespaceScope && !NamespaceScope->empty()) { + ScopeComment = "// In namespace " + + llvm::StringRef(*NamespaceScope).rtrim(':').str() + '\n'; + } + // Note that we don't print anything for global namespace, to not annoy + // non-c++ projects or projects that are not making use of namespaces. 
+ Output.addCodeBlock(ScopeComment + Definition); + } return Output; } diff --git a/clang-tools-extra/clangd/test/hover.test b/clang-tools-extra/clangd/test/hover.test index e45164b346ea5..2162ff9abcdc3 100644 --- a/clang-tools-extra/clangd/test/hover.test +++ b/clang-tools-extra/clangd/test/hover.test @@ -9,7 +9,7 @@ # CHECK-NEXT: "result": { # CHECK-NEXT: "contents": { # CHECK-NEXT: "kind": "plaintext", -# CHECK-NEXT: "value": "Declared in global namespace\n\nvoid foo()" +# CHECK-NEXT: "value": "function foo → void\n\nvoid foo()" # CHECK-NEXT: }, # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { @@ -37,7 +37,7 @@ # CHECK-NEXT: "result": { # CHECK-NEXT: "contents": { # CHECK-NEXT: "kind": "plaintext", -# CHECK-NEXT: "value": "Declared in global namespace\n\nenum foo {}" +# CHECK-NEXT: "value": "enum foo\n\nenum foo {}" # CHECK-NEXT: }, # CHECK-NEXT: "range": { # CHECK-NEXT: "end": { diff --git a/clang-tools-extra/clangd/unittests/HoverTests.cpp b/clang-tools-extra/clangd/unittests/HoverTests.cpp index 4c6d2abbd24a1..44337688ff87a 100644 --- a/clang-tools-extra/clangd/unittests/HoverTests.cpp +++ b/clang-tools-extra/clangd/unittests/HoverTests.cpp @@ -1606,6 +1606,95 @@ TEST(Hover, DocsFromMostSpecial) { } } } +TEST(Hover, Present) { + struct { + const std::function Builder; + llvm::StringRef ExpectedRender; + } Cases[] = { + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Unknown; + HI.Name = "X"; + }, + R"( X)", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::NamespaceAlias; + HI.Name = "foo"; + }, + R"(namespace-alias foo)", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Class; + HI.TemplateParameters = { + {std::string("typename"), std::string("T"), llvm::None}, + {std::string("typename"), std::string("C"), + std::string("bool")}, + }; + HI.Documentation = "documentation"; + HI.Definition = + "template class Foo {}"; + HI.Name = "foo"; + HI.NamespaceScope.emplace(); + }, + R"(class foo +documentation + +template class Foo {})", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Function; + HI.Name = "foo"; + HI.Type = "type"; + HI.ReturnType = "ret_type"; + HI.Parameters.emplace(); + HoverInfo::Param P; + HI.Parameters->push_back(P); + P.Type = "type"; + HI.Parameters->push_back(P); + P.Name = "foo"; + HI.Parameters->push_back(P); + P.Default = "default"; + HI.Parameters->push_back(P); + HI.NamespaceScope = "ns::"; + HI.Definition = "ret_type foo(params) {}"; + }, + R"(function foo → ret_type +- +- type +- type foo +- type foo = default + +// In namespace ns +ret_type foo(params) {})", + }, + { + [](HoverInfo &HI) { + HI.Kind = index::SymbolKind::Variable; + HI.LocalScope = "test::bar::"; + HI.Value = "value"; + HI.Name = "foo"; + HI.Type = "type"; + HI.Definition = "def"; + }, + R"(variable foo : type +Value = value + +// In test::bar +def)", + }, + }; + + for (const auto &C : Cases) { + HoverInfo HI; + C.Builder(HI); + EXPECT_EQ(HI.present().asPlainText(), C.ExpectedRender); + } +} + } // namespace } // namespace clangd } // namespace clang From ccf24225e3f2356ebf0e73bb114a831bf1721222 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 9 Jan 2020 10:23:34 +0000 Subject: [PATCH 39/89] [Matrix] Update shape propagation to iterate until done. This patch updates the shape propagation to iterate until no new shape information is discovered. As initial seed for the forward propagation, we use the matrix intrinsic instructions. 
Both propagateShapeForward and propagateShapeBackward return new work lists, with the instructions to be used for the next iteration. When propagating forward, we record all instructions we added new shape information for. When propagating backward, we record all users of instructions we added new shape information for. Reviewers: anemet, Gerolf, reames, hfinkel, andrew.w.kaylor Reviewed By: anemet Differential Revision: https://reviews.llvm.org/D70901 --- .../Scalar/LowerMatrixIntrinsics.cpp | 105 +++++++++++------- .../propagate-multiple-iterations.ll | 84 ++++++++++++++ 2 files changed, 146 insertions(+), 43 deletions(-) create mode 100644 llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp index afe1b4e7cc784..0ff6ee8bcfcc2 100644 --- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp +++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp @@ -10,9 +10,6 @@ // // TODO: // * Implement multiply & add fusion -// * Implement shape propagation -// * Implement optimizations to reduce or eliminateshufflevector uses by using -// shape information. // * Add remark, summarizing the available matrix optimization opportunities. // //===----------------------------------------------------------------------===// @@ -321,32 +318,12 @@ class LowerMatrixIntrinsics { } /// Propagate the shape information of instructions to their users. - void propagateShapeForward() { - // The work list contains instructions for which we can compute the shape, - // either based on the information provided by matrix intrinsics or known - // shapes of operands. - SmallVector WorkList; - - // Initialize the work list with ops carrying shape information. Initially - // only the shape of matrix intrinsics is known. - for (BasicBlock &BB : Func) - for (Instruction &Inst : BB) { - IntrinsicInst *II = dyn_cast(&Inst); - if (!II) - continue; - - switch (II->getIntrinsicID()) { - case Intrinsic::matrix_multiply: - case Intrinsic::matrix_transpose: - case Intrinsic::matrix_columnwise_load: - case Intrinsic::matrix_columnwise_store: - WorkList.push_back(&Inst); - break; - default: - break; - } - } - + /// The work list contains instructions for which we can compute the shape, + /// either based on the information provided by matrix intrinsics or known + /// shapes of operands. + SmallVector + propagateShapeForward(SmallVectorImpl &WorkList) { + SmallVector NewWorkList; // Pop an element for which we guaranteed to have at least one of the // operand shapes. Add the shape for this and then add users to the work // list. @@ -395,20 +372,29 @@ class LowerMatrixIntrinsics { } } - if (Propagate) + if (Propagate) { + NewWorkList.push_back(Inst); for (auto *User : Inst->users()) if (ShapeMap.count(User) == 0) WorkList.push_back(cast(User)); + } } + + return NewWorkList; } /// Propagate the shape to operands of instructions with shape information. - void propagateShapeBackward() { - SmallVector WorkList; - // Worklist contains instruction for which we already know the shape. - for (auto &V : ShapeMap) - WorkList.push_back(V.first); - + /// \p Worklist contains the instruction for which we already know the shape. + SmallVector + propagateShapeBackward(SmallVectorImpl &WorkList) { + SmallVector NewWorkList; + + auto pushInstruction = [](Value *V, + SmallVectorImpl &WorkList) { + Instruction *I = dyn_cast(V); + if (I) + WorkList.push_back(I); + }; // Pop an element with known shape. 
Traverse the operands, if their shape // derives from the result shape and is unknown, add it and add them to the // worklist. @@ -417,6 +403,7 @@ class LowerMatrixIntrinsics { Value *V = WorkList.back(); WorkList.pop_back(); + size_t BeforeProcessingV = WorkList.size(); if (!isa(V)) continue; @@ -429,21 +416,21 @@ class LowerMatrixIntrinsics { m_Value(MatrixA), m_Value(MatrixB), m_Value(M), m_Value(N), m_Value(K)))) { if (setShapeInfo(MatrixA, {M, N})) - WorkList.push_back(MatrixA); + pushInstruction(MatrixA, WorkList); if (setShapeInfo(MatrixB, {N, K})) - WorkList.push_back(MatrixB); + pushInstruction(MatrixB, WorkList); } else if (match(V, m_Intrinsic( m_Value(MatrixA), m_Value(M), m_Value(N)))) { // Flip dimensions. if (setShapeInfo(MatrixA, {M, N})) - WorkList.push_back(MatrixA); + pushInstruction(MatrixA, WorkList); } else if (match(V, m_Intrinsic( m_Value(MatrixA), m_Value(), m_Value(), m_Value(M), m_Value(N)))) { if (setShapeInfo(MatrixA, {M, N})) { - WorkList.push_back(MatrixA); + pushInstruction(MatrixA, WorkList); } } else if (isa(V) || match(V, m_Intrinsic())) { @@ -456,16 +443,48 @@ class LowerMatrixIntrinsics { ShapeInfo Shape = ShapeMap[V]; for (Use &U : cast(V)->operands()) { if (setShapeInfo(U.get(), Shape)) - WorkList.push_back(U.get()); + pushInstruction(U.get(), WorkList); } } + // After we discovered new shape info for new instructions in the + // worklist, we use their users as seeds for the next round of forward + // propagation. + for (size_t I = BeforeProcessingV; I != WorkList.size(); I++) + for (User *U : WorkList[I]->users()) + if (isa(U) && V != U) + NewWorkList.push_back(cast(U)); } + return NewWorkList; } bool Visit() { if (EnableShapePropagation) { - propagateShapeForward(); - propagateShapeBackward(); + SmallVector WorkList; + + // Initially only the shape of matrix intrinsics is known. + // Initialize the work list with ops carrying shape information. + for (BasicBlock &BB : Func) + for (Instruction &Inst : BB) { + IntrinsicInst *II = dyn_cast(&Inst); + if (!II) + continue; + + switch (II->getIntrinsicID()) { + case Intrinsic::matrix_multiply: + case Intrinsic::matrix_transpose: + case Intrinsic::matrix_columnwise_load: + case Intrinsic::matrix_columnwise_store: + WorkList.push_back(&Inst); + break; + default: + break; + } + } + // Propagate shapes until nothing changes any longer. + while (!WorkList.empty()) { + WorkList = propagateShapeForward(WorkList); + WorkList = propagateShapeBackward(WorkList); + } } ReversePostOrderTraversal RPOT(&Func); diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll new file mode 100644 index 0000000000000..38200b3883dc0 --- /dev/null +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/propagate-multiple-iterations.ll @@ -0,0 +1,84 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -lower-matrix-intrinsics -S < %s | FileCheck %s +; RUN: opt -passes='lower-matrix-intrinsics' -S < %s | FileCheck %s + + +; Make sure we propagate in multiple iterations. First, we back-propagate the +; shape information from the transpose to %A, in the next iteration we +; forward-propagate it to %Mul, and then back to %B. 
+define <16 x double> @backpropagation_iterations(<16 x double>* %A.Ptr, <16 x double>* %B.Ptr) { +; CHECK-LABEL: @backpropagation_iterations( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x double>* [[A_PTR:%.*]] to double* +; CHECK-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <4 x double>* +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x double>, <4 x double>* [[TMP2]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, double* [[TMP1]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = bitcast double* [[TMP5]] to <4 x double>* +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x double>, <4 x double>* [[TMP6]], align 8 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, double* [[TMP1]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = bitcast double* [[TMP9]] to <4 x double>* +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x double>, <4 x double>* [[TMP10]], align 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, double* [[TMP1]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = bitcast double* [[TMP13]] to <4 x double>* +; CHECK-NEXT: [[TMP15:%.*]] = load <4 x double>, <4 x double>* [[TMP14]], align 8 +; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x double> [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> undef, double [[TMP16]], i64 0 +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <4 x double> [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP18]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <4 x double> [[TMP11]], i64 0 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x double> [[TMP19]], double [[TMP20]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <4 x double> [[TMP15]], i64 0 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> [[TMP21]], double [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = extractelement <4 x double> [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <4 x double> undef, double [[TMP24]], i64 0 +; CHECK-NEXT: [[TMP26:%.*]] = extractelement <4 x double> [[TMP7]], i64 1 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <4 x double> [[TMP25]], double [[TMP26]], i64 1 +; CHECK-NEXT: [[TMP28:%.*]] = extractelement <4 x double> [[TMP11]], i64 1 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x double> [[TMP27]], double [[TMP28]], i64 2 +; CHECK-NEXT: [[TMP30:%.*]] = extractelement <4 x double> [[TMP15]], i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <4 x double> [[TMP29]], double [[TMP30]], i64 3 +; CHECK-NEXT: [[TMP32:%.*]] = extractelement <4 x double> [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = insertelement <4 x double> undef, double [[TMP32]], i64 0 +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <4 x double> [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x double> [[TMP33]], double [[TMP34]], i64 1 +; CHECK-NEXT: [[TMP36:%.*]] = extractelement <4 x double> [[TMP11]], i64 2 +; CHECK-NEXT: [[TMP37:%.*]] = insertelement <4 x double> [[TMP35]], double [[TMP36]], i64 2 +; CHECK-NEXT: [[TMP38:%.*]] = extractelement <4 x double> [[TMP15]], i64 2 +; CHECK-NEXT: [[TMP39:%.*]] = insertelement <4 x double> [[TMP37]], double [[TMP38]], i64 3 +; CHECK-NEXT: [[TMP40:%.*]] = extractelement <4 x double> [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x double> undef, double [[TMP40]], i64 0 +; CHECK-NEXT: [[TMP42:%.*]] = extractelement <4 x double> [[TMP7]], i64 3 +; CHECK-NEXT: [[TMP43:%.*]] = insertelement <4 x double> [[TMP41]], double [[TMP42]], i64 1 +; CHECK-NEXT: [[TMP44:%.*]] = extractelement <4 x double> [[TMP11]], i64 3 +; CHECK-NEXT: [[TMP45:%.*]] = insertelement <4 x double> 
[[TMP43]], double [[TMP44]], i64 2 +; CHECK-NEXT: [[TMP46:%.*]] = extractelement <4 x double> [[TMP15]], i64 3 +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x double> [[TMP45]], double [[TMP46]], i64 3 +; CHECK-NEXT: [[TMP48:%.*]] = bitcast <16 x double>* [[B_PTR:%.*]] to double* +; CHECK-NEXT: [[TMP49:%.*]] = bitcast double* [[TMP48]] to <4 x double>* +; CHECK-NEXT: [[TMP50:%.*]] = load <4 x double>, <4 x double>* [[TMP49]], align 8 +; CHECK-NEXT: [[TMP52:%.*]] = getelementptr double, double* [[TMP48]], i32 4 +; CHECK-NEXT: [[TMP53:%.*]] = bitcast double* [[TMP52]] to <4 x double>* +; CHECK-NEXT: [[TMP54:%.*]] = load <4 x double>, <4 x double>* [[TMP53]], align 8 +; CHECK-NEXT: [[TMP56:%.*]] = getelementptr double, double* [[TMP48]], i32 8 +; CHECK-NEXT: [[TMP57:%.*]] = bitcast double* [[TMP56]] to <4 x double>* +; CHECK-NEXT: [[TMP58:%.*]] = load <4 x double>, <4 x double>* [[TMP57]], align 8 +; CHECK-NEXT: [[TMP60:%.*]] = getelementptr double, double* [[TMP48]], i32 12 +; CHECK-NEXT: [[TMP61:%.*]] = bitcast double* [[TMP60]] to <4 x double>* +; CHECK-NEXT: [[TMP62:%.*]] = load <4 x double>, <4 x double>* [[TMP61]], align 8 +; CHECK-NEXT: [[TMP63:%.*]] = fmul <4 x double> [[TMP3]], [[TMP50]] +; CHECK-NEXT: [[TMP64:%.*]] = fmul <4 x double> [[TMP7]], [[TMP54]] +; CHECK-NEXT: [[TMP65:%.*]] = fmul <4 x double> [[TMP11]], [[TMP58]] +; CHECK-NEXT: [[TMP66:%.*]] = fmul <4 x double> [[TMP15]], [[TMP62]] +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x double> [[TMP63]], <4 x double> [[TMP64]], <8 x i32> +; CHECK-NEXT: [[TMP68:%.*]] = shufflevector <4 x double> [[TMP65]], <4 x double> [[TMP66]], <8 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = shufflevector <8 x double> [[TMP67]], <8 x double> [[TMP68]], <16 x i32> +; CHECK-NEXT: ret <16 x double> [[TMP69]] +; + %A = load <16 x double>, <16 x double>* %A.Ptr + %A.trans = tail call <16 x double> @llvm.matrix.transpose.v16f64(<16 x double> %A, i32 4, i32 4) + %B = load <16 x double>, <16 x double>* %B.Ptr + %Mul = fmul <16 x double> %A, %B + ret <16 x double> %Mul +} + +declare <16 x double> @llvm.matrix.multiply.v16f64.v16f64.v16f64(<16 x double>, <16 x double>, i32 immarg, i32 immarg, i32 immarg) +declare <16 x double> @llvm.matrix.transpose.v16f64(<16 x double>, i32 immarg, i32 immarg) From 782ad91cc423bf602718e2bf9ffc59e55350463f Mon Sep 17 00:00:00 2001 From: Raphael Isemann Date: Thu, 9 Jan 2020 12:09:44 +0100 Subject: [PATCH 40/89] [lldb] Fix that TestNoSuchArch.py was passing for the wrong reason The command here failed due to the type in 'create' but the expect did not actually check for the error message. This fixes the typo and adds a check for the actuall error message we should see. 
--- .../commands/target/create-no-such-arch/TestNoSuchArch.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py b/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py index a780ca2756647..4d7f0838f8776 100644 --- a/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py +++ b/lldb/packages/Python/lldbsuite/test/commands/target/create-no-such-arch/TestNoSuchArch.py @@ -19,8 +19,8 @@ def test(self): # Check that passing an invalid arch via the command-line fails but # doesn't crash self.expect( - "target crete --arch nothingtoseehere %s" % - (exe), error=True) + "target create --arch nothingtoseehere %s" % + (exe), error=True, substrs=["error: invalid triple 'nothingtoseehere'"]) # Check that passing an invalid arch via the SB API fails but doesn't # crash From 0d5407987a36445f21f122656edccbb0d9234f62 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jan 2020 11:10:46 +0000 Subject: [PATCH 41/89] Fix MSVC unhandled enum warning. NFCI. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 05d29fbadfe80..ee7a2e9204ace 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -11190,6 +11190,7 @@ bool checkContext( case llvm::Triple::wasm64: case llvm::Triple::renderscript32: case llvm::Triple::renderscript64: + case llvm::Triple::ve: return false; } } From 795c38eb4df636d434a9821efecbfeb41ecba843 Mon Sep 17 00:00:00 2001 From: Andi-Bogdan Postelnicu Date: Thu, 9 Jan 2020 10:00:05 +0200 Subject: [PATCH 42/89] [clang-tidy] For checker `readability-misleading-indentation` update tests. Summary: In D72333 we've introduced support for `if constexpr` but the test for uninstantiated template was not ready to land on windows platform since this target uses `-fdelayed-template-parsing` by default. This patch addresses this by passing `-fno-delayed-template-parsing` to the test. 
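For reference, extra compiler flags reach the check after the second `--` separator of the lit substitution, along these lines (sketch; the exact RUN line is in the diff below):

    // RUN: %check_clang_tidy %s readability-misleading-indentation %t -- -- -fno-delayed-template-parsing

With delayed template parsing enabled, the body of a never-instantiated template is not parsed into an AST, so the check has nothing to inspect; disabling it keeps the new `if constexpr` test uniform across platforms.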
Reviewers: JonasToth Subscribers: xazax.hun, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D72438 --- .../checkers/readability-misleading-indentation.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp index c3bd33d8ee7b8..aea0618d120db 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/readability-misleading-indentation.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy %s readability-misleading-indentation %t +// RUN: %check_clang_tidy %s readability-misleading-indentation %t -- -- -fno-delayed-template-parsing void foo1(); void foo2(); @@ -168,6 +168,17 @@ void mustFailNonTemplate() { // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: different indentation for 'if' and corresponding 'else' [readability-misleading-indentation] } +template +void mustFailNoInsta() { + if constexpr (b) { + foo1(); + } + else { + foo2(); + // CHECK-MESSAGES: :[[@LINE-2]]:5: warning: different indentation for 'if' and corresponding 'else' [readability-misleading-indentation] + } +} + template void mustPassNoInsta() { if constexpr (b) { From 15c7fa4d11eeb50095ae571c645427b9a267bdee Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 9 Jan 2020 11:57:04 +0000 Subject: [PATCH 43/89] [ARM][MVE] Don't unroll intrinsic loops. We don't unroll vector loops for MVE targets, but we miss the case when loops only contain intrinsic calls. So just move the logic a bit to catch this case. Differential Revision: https://reviews.llvm.org/D72440 --- .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 9 ++-- .../Transforms/LoopUnroll/ARM/mve-nounroll.ll | 49 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 41ad8b0c04de4..7ff05034c1f25 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1235,6 +1235,11 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, unsigned Cost = 0; for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + // Don't unroll vectorised loop. MVE does not benefit from it as much as + // scalar code. + if (I.getType()->isVectorTy()) + return; + if (isa(I) || isa(I)) { ImmutableCallSite CS(&I); if (const Function *F = CS.getCalledFunction()) { @@ -1243,10 +1248,6 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, } return; } - // Don't unroll vectorised loop. MVE does not benefit from it as much as - // scalar code. - if (I.getType()->isVectorTy()) - return; SmallVector Operands(I.value_op_begin(), I.value_op_end()); diff --git a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll index 2067969a994ff..15016c1f7e7d9 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/mve-nounroll.ll @@ -121,6 +121,55 @@ for.cond.cleanup: ; preds = %for.cond1.for.cond. ret void } +; Test that we don't unroll loops that only contain vector intrinsics. 
+; CHECK-LABEL: test_intrinsics +; CHECK: call <16 x i8> @llvm.arm.mve.sub +; CHECK-NOT: call <16 x i8> @llvm.arm.mve.sub +define dso_local arm_aapcs_vfpcc void @test_intrinsics(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) { +entry: + %cmp8 = icmp eq i32 %N, 0 + %tmp8 = add i32 %N, 15 + %tmp9 = lshr i32 %tmp8, 4 + %tmp10 = shl nuw i32 %tmp9, 4 + %tmp11 = add i32 %tmp10, -16 + %tmp12 = lshr i32 %tmp11, 4 + %tmp13 = add nuw nsw i32 %tmp12, 1 + br i1 %cmp8, label %for.cond.cleanup, label %vector.ph + +vector.ph: + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %tmp14 = phi i32 [ %tmp13, %vector.ph ], [ %tmp15, %vector.body ] + %0 = phi i32 [ %N, %vector.ph ], [ %2, %vector.body ] + %tmp = getelementptr inbounds i8, i8* %a, i32 %index + %1 = call <16 x i1> @llvm.arm.mve.vctp8(i32 %0) + %2 = sub i32 %0, 16 + %tmp2 = bitcast i8* %tmp to <16 x i8>* + %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %1, <16 x i8> undef) + %tmp3 = getelementptr inbounds i8, i8* %b, i32 %index + %tmp4 = bitcast i8* %tmp3 to <16 x i8>* + %wide.masked.load2 = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp4, i32 4, <16 x i1> %1, <16 x i8> undef) + %sub = call <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8> %wide.masked.load2, <16 x i8> %wide.masked.load, <16 x i1> %1, <16 x i8> undef) + %tmp6 = getelementptr inbounds i8, i8* %c, i32 %index + %tmp7 = bitcast i8* %tmp6 to <16 x i8>* + tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %sub, <16 x i8>* %tmp7, i32 4, <16 x i1> %1) + %index.next = add i32 %index, 16 + %tmp15 = sub i32 %tmp14, 1 + %tmp16 = icmp ne i32 %tmp15, 0 + br i1 %tmp16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <16 x i1> @llvm.arm.mve.vctp8(i32) +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32, <16 x i1>, <16 x i8>) +declare <16 x i8> @llvm.arm.mve.sub.predicated.v16i8.v16i1(<16 x i8>, <16 x i8>, <16 x i1>, <16 x i8>) +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32, <16 x i1>) + + !0 = distinct !{!0, !1} !1 = !{!"llvm.loop.isvectorized", i32 1} !2 = distinct !{!2, !3, !1} From e3e72a2619ccec9654ef9ad45e1e926841fbaa58 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jan 2020 11:48:06 +0000 Subject: [PATCH 44/89] Fix "pointer is null" static analyzer warnings. NFCI. Assert that the pointers are non-null before dereferencing them. --- clang/lib/CodeGen/CGObjCGNU.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp index 479cd8ec77cec..a27b6d4ed6374 100644 --- a/clang/lib/CodeGen/CGObjCGNU.cpp +++ b/clang/lib/CodeGen/CGObjCGNU.cpp @@ -1236,6 +1236,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep { // The first Interface we find may be a @class, // which should only be treated as the source of // truth in the absence of a true declaration. 
+ assert(OID && "Failed to find ObjCInterfaceDecl"); const ObjCInterfaceDecl *OIDDef = OID->getDefinition(); if (OIDDef != nullptr) OID = OIDDef; @@ -3036,6 +3037,7 @@ llvm::Value *CGObjCGNU::GenerateProtocolRef(CodeGenFunction &CGF, llvm::Constant *&protocol = ExistingProtocols[PD->getNameAsString()]; if (!protocol) GenerateProtocol(PD); + assert(protocol && "Unknown protocol"); llvm::Type *T = CGM.getTypes().ConvertType(CGM.getContext().getObjCProtoType()); return CGF.Builder.CreateBitCast(protocol, llvm::PointerType::getUnqual(T)); From e315ce2bd3a5347ab58d50fab2d449e8a4e60e00 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jan 2020 12:05:30 +0000 Subject: [PATCH 45/89] Fix "pointer is null" static analyzer warnings. NFCI. Use castAs<> instead of getAs<> since the pointer is dereferenced immediately below and castAs will perform the null assertion for us. --- clang/lib/Sema/SemaCoroutine.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp index 271c4a10f3e44..6dc9e342beb92 100644 --- a/clang/lib/Sema/SemaCoroutine.cpp +++ b/clang/lib/Sema/SemaCoroutine.cpp @@ -1228,7 +1228,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { return false; if (RequiresNoThrowAlloc) { - const auto *FT = OperatorNew->getType()->getAs(); + const auto *FT = OperatorNew->getType()->castAs(); if (!FT->isNothrow(/*ResultIfDependent*/ false)) { S.Diag(OperatorNew->getLocation(), diag::err_coroutine_promise_new_requires_nothrow) @@ -1281,7 +1281,7 @@ bool CoroutineStmtBuilder::makeNewAndDeleteExpr() { // Check if we need to pass the size. const auto *OpDeleteType = - OpDeleteQualType.getTypePtr()->getAs(); + OpDeleteQualType.getTypePtr()->castAs(); if (OpDeleteType->getNumParams() > 1) DeleteArgs.push_back(FrameSize); From 9bb01efa49ca7f069bc7acba7e4c9bf64d972e79 Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Tue, 17 Dec 2019 17:12:29 +0100 Subject: [PATCH 46/89] [lldb/DWARF] Add is_dwo member to DWARFUnit Summary: A skeleton unit can easily be detected by checking the m_dwo_symbol_file member, but we cannot tell a split unit from a normal unit from the "inside", which is sometimes useful. This patch adds a m_is_dwo member to enable this, and align the code with llvm::DWARFUnit. Right now it's only used to avoid creating a split unit inside another split unit (which removes one override from SymbolFileDWARFDwo and brings us a step closer to deleting it), but my main motivation is fixing the handling of location lists in mixed v4&v5 files. This comes in a separate patch. 
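Concretely, the immediate use is an early-out when attaching dwo units (a minimal sketch of the AddUnitDIE() change below):

    if (m_is_dwo)
      return;  // A split unit never gets another split unit attached to it.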
Reviewers: JDevlieghere, aprantl, clayborg Subscribers: dblaikie, lldb-commits Tags: #lldb Differential Revision: https://reviews.llvm.org/D71750 --- .../Plugins/SymbolFile/DWARF/DWARFCompileUnit.h | 4 ++-- .../Plugins/SymbolFile/DWARF/DWARFContext.h | 4 ++-- .../Plugins/SymbolFile/DWARF/DWARFTypeUnit.h | 4 ++-- .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 16 ++++++++++------ lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h | 3 ++- .../Plugins/SymbolFile/DWARF/SymbolFileDWARF.h | 2 +- .../SymbolFile/DWARF/SymbolFileDWARFDwo.h | 6 ------ 7 files changed, 19 insertions(+), 20 deletions(-) diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h index 75647dbb082f2..454637ef981c7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFCompileUnit.h @@ -24,8 +24,8 @@ class DWARFCompileUnit : public DWARFUnit { DWARFCompileUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) - : DWARFUnit(dwarf, uid, header, abbrevs, section) {} + DIERef::Section section, bool is_dwo) + : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {} DISALLOW_COPY_AND_ASSIGN(DWARFCompileUnit); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h index add0423840397..24baac90aa445 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFContext.h @@ -41,8 +41,6 @@ class DWARFContext { SectionData m_data_debug_str_offsets; SectionData m_data_debug_types; - bool isDwo() { return m_dwo_section_list != nullptr; } - const DWARFDataExtractor & LoadOrGetSection(lldb::SectionType main_section_type, llvm::Optional dwo_section_type, @@ -67,6 +65,8 @@ class DWARFContext { const DWARFDataExtractor &getOrLoadStrOffsetsData(); const DWARFDataExtractor &getOrLoadDebugTypesData(); + bool isDwo() { return m_dwo_section_list != nullptr; } + llvm::DWARFContext &GetAsLLVM(); }; } // namespace lldb_private diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h index 6ff73ecd8efa3..8967509c081aa 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFTypeUnit.h @@ -28,8 +28,8 @@ class DWARFTypeUnit : public DWARFUnit { DWARFTypeUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) - : DWARFUnit(dwarf, uid, header, abbrevs, section) {} + DIERef::Section section, bool is_dwo) + : DWARFUnit(dwarf, uid, header, abbrevs, section, is_dwo) {} friend class DWARFUnit; }; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index d8d70bae0232d..32f0f89c042a0 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -32,9 +32,9 @@ extern int g_verbose; DWARFUnit::DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section) + DIERef::Section section, bool is_dwo) : UserID(uid), m_dwarf(dwarf), m_header(header), m_abbrevs(&abbrevs), - m_cancel_scopes(false), m_section(section) {} + m_cancel_scopes(false), m_section(section), m_is_dwo(is_dwo) {} 
DWARFUnit::~DWARFUnit() = default; @@ -336,6 +336,9 @@ void DWARFUnit::AddUnitDIE(const DWARFDebugInfoEntry &cu_die) { } } + if (m_is_dwo) + return; + std::unique_ptr dwo_symbol_file = m_dwarf.GetDwoSymbolFileForCompileUnit(*this, cu_die); if (!dwo_symbol_file) @@ -872,11 +875,12 @@ DWARFUnit::extract(SymbolFileDWARF &dwarf, user_id_t uid, return llvm::make_error( "No abbrev exists at the specified offset."); + bool is_dwo = dwarf.GetDWARFContext().isDwo(); if (expected_header->IsTypeUnit()) - return DWARFUnitSP( - new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs, section)); - return DWARFUnitSP( - new DWARFCompileUnit(dwarf, uid, *expected_header, *abbrevs, section)); + return DWARFUnitSP(new DWARFTypeUnit(dwarf, uid, *expected_header, *abbrevs, + section, is_dwo)); + return DWARFUnitSP(new DWARFCompileUnit(dwarf, uid, *expected_header, + *abbrevs, section, is_dwo)); } const lldb_private::DWARFDataExtractor &DWARFUnit::GetData() const { diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index d53ed756fe05d..5e94dc106ce60 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -247,7 +247,7 @@ class DWARFUnit : public lldb_private::UserID { DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, const DWARFAbbreviationDeclarationSet &abbrevs, - DIERef::Section section); + DIERef::Section section, bool is_dwo); llvm::Error ExtractHeader(SymbolFileDWARF &dwarf, const lldb_private::DWARFDataExtractor &data, @@ -314,6 +314,7 @@ class DWARFUnit : public lldb_private::UserID { llvm::Optional m_loclist_table_header; const DIERef::Section m_section; + bool m_is_dwo; private: void ParseProducerInfo(); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index 35b18f4b02b35..bf9a6e5b237b7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -287,7 +287,7 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, lldb::user_id_t GetUID(DIERef ref); - virtual std::unique_ptr + std::unique_ptr GetDwoSymbolFileForCompileUnit(DWARFUnit &dwarf_cu, const DWARFDebugInfoEntry &cu_die); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index d07209784dd7e..641fd1f2ce32d 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -47,12 +47,6 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DWARFDIE GetDIE(const DIERef &die_ref) override; - std::unique_ptr - GetDwoSymbolFileForCompileUnit(DWARFUnit &dwarf_cu, - const DWARFDebugInfoEntry &cu_die) override { - return nullptr; - } - DWARFCompileUnit *GetBaseCompileUnit() override { return &m_base_dwarf_cu; } llvm::Optional GetDwoNum() override { return GetID() >> 32; } From cd5da94d80b2b0f2bdb2d0157e24705a4cbd2a4e Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Fri, 20 Dec 2019 09:42:37 +0100 Subject: [PATCH 47/89] [lldb/DWARF] Fix mixed v4+v5 location lists Summary: Our code was expecting that a single (symbol) file contains only one kind of location lists. This is not correct (on non-apple platforms, at least) as a file can compile units with different dwarf versions. This patch moves the deteremination of location list flavour down to the compile unit level, fixing this problem. 
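The gist, as a simplified sketch of the DWARFUnit changes below (data-extractor setup elided):

    std::unique_ptr<llvm::DWARFLocationTable>
    DWARFUnit::GetLocationTable(const DataExtractor &data) const {
      // DWARF v5 units and split (dwo) units use the DW_LLE-encoded tables.
      if (m_is_dwo || GetVersion() >= 5)
        return std::make_unique<llvm::DWARFDebugLoclists>(llvm_data, GetVersion());
      // Everything else falls back to the DWARF v4 .debug_loc format.
      return std::make_unique<llvm::DWARFDebugLoc>(llvm_data);
    }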
I have also tried to rougly align the code with the llvm DWARFUnit. Fully matching the API is not possible because of how lldb's DWARFExpression lives separately from the rest of the DWARF code, but this is at least a step in the right direction. Reviewers: JDevlieghere, aprantl, clayborg Subscribers: dblaikie, lldb-commits Tags: #lldb Differential Revision: https://reviews.llvm.org/D71751 --- .../include/lldb/Expression/DWARFExpression.h | 9 - lldb/source/Expression/DWARFExpression.cpp | 35 +--- .../SymbolFile/DWARF/DWARFDebugInfoEntry.cpp | 6 +- .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp | 16 ++ .../Plugins/SymbolFile/DWARF/DWARFUnit.h | 7 + .../SymbolFile/DWARF/SymbolFileDWARF.cpp | 16 +- .../SymbolFile/DWARF/SymbolFileDWARF.h | 6 - .../SymbolFile/DWARF/SymbolFileDWARFDwo.cpp | 7 - .../SymbolFile/DWARF/SymbolFileDWARFDwo.h | 3 - .../SymbolFile/DWARF/debug_loc_and_loclists.s | 154 ++++++++++++++++++ 10 files changed, 184 insertions(+), 75 deletions(-) create mode 100644 lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s diff --git a/lldb/include/lldb/Expression/DWARFExpression.h b/lldb/include/lldb/Expression/DWARFExpression.h index 1e32957443fd3..bfae142d5e016 100644 --- a/lldb/include/lldb/Expression/DWARFExpression.h +++ b/lldb/include/lldb/Expression/DWARFExpression.h @@ -34,15 +34,6 @@ namespace lldb_private { /// location expression or a location list and interprets it. class DWARFExpression { public: - enum LocationListFormat : uint8_t { - NonLocationList, // Not a location list - RegularLocationList, // Location list format used in non-split dwarf files - SplitDwarfLocationList, // Location list format used in pre-DWARF v5 split - // dwarf files (.debug_loc.dwo) - LocLists, // Location list format used in DWARF v5 - // (.debug_loclists/.debug_loclists.dwo). - }; - DWARFExpression(); /// Constructor diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index c67e35b145189..69c84640ef93a 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -54,29 +54,6 @@ ReadAddressFromDebugAddrSection(const DWARFUnit *dwarf_cu, return LLDB_INVALID_ADDRESS; } -/// Return the location list parser for the given format. -static std::unique_ptr -GetLocationTable(DWARFExpression::LocationListFormat format, const DataExtractor &data) { - llvm::DWARFDataExtractor llvm_data( - toStringRef(data.GetData()), - data.GetByteOrder() == lldb::eByteOrderLittle, data.GetAddressByteSize()); - - switch (format) { - case DWARFExpression::NonLocationList: - return nullptr; - // DWARF<=4 .debug_loc - case DWARFExpression::RegularLocationList: - return std::make_unique(llvm_data); - // Non-standard DWARF 4 extension (fission) .debug_loc.dwo - case DWARFExpression::SplitDwarfLocationList: - // DWARF 5 .debug_loclists(.dwo) - case DWARFExpression::LocLists: - return std::make_unique( - llvm_data, format == DWARFExpression::LocLists ? 
5 : 4); - } - llvm_unreachable("Invalid LocationListFormat!"); -} - // DWARFExpression constructor DWARFExpression::DWARFExpression() : m_module_wp(), m_data(), m_dwarf_cu(nullptr), @@ -157,10 +134,8 @@ void DWARFExpression::GetDescription(Stream *s, lldb::DescriptionLevel level, if (IsLocationList()) { // We have a location list lldb::offset_t offset = 0; - std::unique_ptr loctable_up = GetLocationTable( - m_dwarf_cu->GetSymbolFileDWARF().GetLocationListFormat(), m_data); - if (!loctable_up) - return; + std::unique_ptr loctable_up = + m_dwarf_cu->GetLocationTable(m_data); llvm::MCRegisterInfo *MRI = abi ? &abi->GetMCRegisterInfo() : nullptr; @@ -2812,10 +2787,8 @@ DWARFExpression::GetLocationExpression(addr_t load_function_start, addr_t addr) const { Log *log = GetLogIfAllCategoriesSet(LIBLLDB_LOG_EXPRESSIONS); - std::unique_ptr loctable_up = GetLocationTable( - m_dwarf_cu->GetSymbolFileDWARF().GetLocationListFormat(), m_data); - if (!loctable_up) - return llvm::None; + std::unique_ptr loctable_up = + m_dwarf_cu->GetLocationTable(m_data); llvm::Optional result; uint64_t offset = 0; auto lookup_addr = diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp index 5612c59059bed..5b95912909ee9 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfoEntry.cpp @@ -344,7 +344,7 @@ bool DWARFDebugInfoEntry::GetDIENamesAndRanges( *frame_base = DWARFExpression( module, DataExtractor(data, block_offset, block_length), cu); } else { - DataExtractor data = dwarf.DebugLocData(); + DataExtractor data = cu->GetLocationData(); const dw_offset_t offset = form_value.Unsigned(); if (data.ValidOffset(offset)) { data = DataExtractor(data, offset, data.GetByteSize() - offset); @@ -478,8 +478,6 @@ void DWARFDebugInfoEntry::DumpAttribute( s.PutCString("( "); - SymbolFileDWARF &dwarf = cu->GetSymbolFileDWARF(); - // Check to see if we have any special attribute formatters switch (attr) { case DW_AT_stmt_list: @@ -509,7 +507,7 @@ void DWARFDebugInfoEntry::DumpAttribute( // We have a location list offset as the value that is the offset into // the .debug_loc section that describes the value over it's lifetime uint64_t debug_loc_offset = form_value.Unsigned(); - DWARFExpression::PrintDWARFLocationList(s, cu, dwarf.DebugLocData(), + DWARFExpression::PrintDWARFLocationList(s, cu, cu->GetLocationData(), debug_loc_offset); } } break; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp index 32f0f89c042a0..dcb38da3c43ee 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp @@ -462,6 +462,22 @@ void DWARFUnit::SetLoclistsBase(dw_addr_t loclists_base) { } } +std::unique_ptr +DWARFUnit::GetLocationTable(const DataExtractor &data) const { + llvm::DWARFDataExtractor llvm_data( + toStringRef(data.GetData()), + data.GetByteOrder() == lldb::eByteOrderLittle, data.GetAddressByteSize()); + + if (m_is_dwo || GetVersion() >= 5) + return std::make_unique(llvm_data, GetVersion()); + return std::make_unique(llvm_data); +} + +const DWARFDataExtractor &DWARFUnit::GetLocationData() const { + return GetVersion() >= 5 ? 
GetSymbolFileDWARF().get_debug_loclists_data() + : GetSymbolFileDWARF().get_debug_loc_data(); +} + void DWARFUnit::SetRangesBase(dw_addr_t ranges_base) { m_ranges_base = ranges_base; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h index 5e94dc106ce60..6bee4ab8be8e7 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h @@ -243,6 +243,13 @@ class DWARFUnit : public lldb_private::UserID { return *Offset + m_loclists_base; } + /// Return the location table for parsing the given location list data. The + /// format is chosen according to the unit type. Never returns null. + std::unique_ptr + GetLocationTable(const lldb_private::DataExtractor &data) const; + + const lldb_private::DWARFDataExtractor &GetLocationData() const; + protected: DWARFUnit(SymbolFileDWARF &dwarf, lldb::user_id_t uid, const DWARFUnitHeader &header, diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 53339ea31e71a..0792260e36fe5 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -597,13 +597,6 @@ void SymbolFileDWARF::LoadSectionData(lldb::SectionType sect_type, m_objfile_sp->ReadSectionData(section_sp.get(), data); } -const DWARFDataExtractor &SymbolFileDWARF::DebugLocData() { - const DWARFDataExtractor &debugLocData = get_debug_loc_data(); - if (debugLocData.GetByteSize() > 0) - return debugLocData; - return get_debug_loclists_data(); -} - const DWARFDataExtractor &SymbolFileDWARF::get_debug_loc_data() { return GetCachedSectionData(eSectionTypeDWARFDebugLoc, m_data_debug_loc); } @@ -3361,7 +3354,7 @@ VariableSP SymbolFileDWARF::ParseVariableDIE(const SymbolContext &sc, module, DataExtractor(data, block_offset, block_length), die.GetCU()); } else { - DataExtractor data = DebugLocData(); + DataExtractor data = die.GetCU()->GetLocationData(); dw_offset_t offset = form_value.Unsigned(); if (form_value.Form() == DW_FORM_loclistx) offset = die.GetCU()->GetLoclistOffset(offset).getValueOr(-1); @@ -3978,13 +3971,6 @@ SymbolFileDWARFDebugMap *SymbolFileDWARF::GetDebugMapSymfile() { return m_debug_map_symfile; } -DWARFExpression::LocationListFormat -SymbolFileDWARF::GetLocationListFormat() const { - if (m_data_debug_loclists.m_data.GetByteSize() > 0) - return DWARFExpression::LocLists; - return DWARFExpression::RegularLocationList; -} - SymbolFileDWARFDwp *SymbolFileDWARF::GetDwpSymbolFile() { llvm::call_once(m_dwp_symfile_once_flag, [this]() { ModuleSpec module_spec; diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h index bf9a6e5b237b7..f816dd77800e4 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.h @@ -21,7 +21,6 @@ #include "lldb/Core/UniqueCStringMap.h" #include "lldb/Core/dwarf.h" -#include "lldb/Expression/DWARFExpression.h" #include "lldb/Symbol/DebugMacros.h" #include "lldb/Symbol/SymbolContext.h" #include "lldb/Symbol/SymbolFile.h" @@ -236,8 +235,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, DWARFDebugRanges *GetDebugRanges(); - const lldb_private::DWARFDataExtractor &DebugLocData(); - static bool SupportedVersion(uint16_t version); DWARFDIE @@ -260,9 +257,6 @@ class SymbolFileDWARF : public lldb_private::SymbolFile, virtual lldb::CompUnitSP 
ParseCompileUnit(DWARFCompileUnit &dwarf_cu); - virtual lldb_private::DWARFExpression::LocationListFormat - GetLocationListFormat() const; - lldb::ModuleSP GetExternalModule(lldb_private::ConstString name); typedef std::map diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp index 331417fe5cd12..f75f06f31e2da 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.cpp @@ -137,13 +137,6 @@ SymbolFileDWARF &SymbolFileDWARFDwo::GetBaseSymbolFile() { return m_base_dwarf_cu.GetSymbolFileDWARF(); } -DWARFExpression::LocationListFormat -SymbolFileDWARFDwo::GetLocationListFormat() const { - return m_base_dwarf_cu.GetVersion() >= 5 - ? DWARFExpression::LocLists - : DWARFExpression::SplitDwarfLocationList; -} - llvm::Expected SymbolFileDWARFDwo::GetTypeSystemForLanguage(LanguageType language) { return GetBaseSymbolFile().GetTypeSystemForLanguage(language); diff --git a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h index 641fd1f2ce32d..0855dba044e4f 100644 --- a/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h +++ b/lldb/source/Plugins/SymbolFile/DWARF/SymbolFileDWARFDwo.h @@ -35,9 +35,6 @@ class SymbolFileDWARFDwo : public SymbolFileDWARF { DWARFUnit * GetDWARFCompileUnit(lldb_private::CompileUnit *comp_unit) override; - lldb_private::DWARFExpression::LocationListFormat - GetLocationListFormat() const override; - size_t GetObjCMethodDIEOffsets(lldb_private::ConstString class_name, DIEArray &method_die_offsets) override; diff --git a/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s b/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s new file mode 100644 index 0000000000000..05bccbe78aabf --- /dev/null +++ b/lldb/test/Shell/SymbolFile/DWARF/debug_loc_and_loclists.s @@ -0,0 +1,154 @@ +# Test that we can handle DWARF 4 and 5 location lists in the same object file +# (but different compile units). 
+ +# REQUIRES: x86 + +# RUN: llvm-mc -triple=x86_64-pc-linux -filetype=obj %s > %t +# RUN: %lldb %t -o "image lookup -v -s loc" -o "image lookup -v -s loclists" \ +# RUN: -o exit | FileCheck %s + + +# CHECK-LABEL: image lookup -v -s loc +# CHECK: Variable: {{.*}}, name = "x0", type = "int", location = DW_OP_reg5 RDI, + +# CHECK-LABEL: image lookup -v -s loclists +# CHECK: Variable: {{.*}}, name = "x1", type = "int", location = DW_OP_reg0 RAX, + + +loc: + nop +.Lloc_end: + +loclists: + nop +.Lloclists_end: + + .section .debug_loc,"",@progbits +.Lloc_list: + .quad loc-loc + .quad .Lloc_end-loc + .short 1 + .byte 85 # super-register DW_OP_reg5 + .quad 0 + .quad 0 + + .section .debug_loclists,"",@progbits + .long .Ldebug_loclist_table_end0-.Ldebug_loclist_table_start0 # Length +.Ldebug_loclist_table_start0: + .short 5 # Version + .byte 8 # Address size + .byte 0 # Segment selector size + .long 0 # Offset entry count + +.Lloclists_list: + .byte 4 # DW_LLE_offset_pair + .uleb128 loclists-loclists + .uleb128 .Lloclists_end-loclists + .uleb128 1 + .byte 80 # super-register DW_OP_reg0 + .byte 0 # DW_LLE_end_of_list +.Ldebug_loclist_table_end0: + + .section .debug_abbrev,"",@progbits + .byte 1 # Abbreviation Code + .byte 17 # DW_TAG_compile_unit + .byte 1 # DW_CHILDREN_yes + .byte 37 # DW_AT_producer + .byte 8 # DW_FORM_string + .byte 19 # DW_AT_language + .byte 5 # DW_FORM_data2 + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 2 # Abbreviation Code + .byte 46 # DW_TAG_subprogram + .byte 1 # DW_CHILDREN_yes + .byte 17 # DW_AT_low_pc + .byte 1 # DW_FORM_addr + .byte 18 # DW_AT_high_pc + .byte 6 # DW_FORM_data4 + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 16 # DW_FORM_ref_addr + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 3 # Abbreviation Code + .byte 5 # DW_TAG_formal_parameter + .byte 0 # DW_CHILDREN_no + .byte 2 # DW_AT_location + .byte 23 # DW_FORM_sec_offset + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 73 # DW_AT_type + .byte 16 # DW_FORM_ref_addr + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 4 # Abbreviation Code + .byte 36 # DW_TAG_base_type + .byte 0 # DW_CHILDREN_no + .byte 3 # DW_AT_name + .byte 8 # DW_FORM_string + .byte 62 # DW_AT_encoding + .byte 11 # DW_FORM_data1 + .byte 11 # DW_AT_byte_size + .byte 11 # DW_FORM_data1 + .byte 0 # EOM(1) + .byte 0 # EOM(2) + .byte 0 # EOM(3) + + .section .debug_info,"",@progbits + .long .Lloc_cu_end-.Lloc_cu_start # Length of Unit +.Lloc_cu_start: + .short 4 # DWARF version number + .long .debug_abbrev # Offset Into Abbrev. 
Section + .byte 8 # Address Size (in bytes) + .byte 1 # Abbrev [1] 0xb:0x50 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 12 # DW_AT_language + .quad loc # DW_AT_low_pc + .long .Lloc_end-loc # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x29 DW_TAG_subprogram + .quad loc # DW_AT_low_pc + .long .Lloc_end-loc # DW_AT_high_pc + .asciz "loc" # DW_AT_name + .long .Lint # DW_AT_type + .byte 3 # Abbrev [3] DW_TAG_formal_parameter + .long .Lloc_list # DW_AT_location + .asciz "x0" # DW_AT_name + .long .Lint # DW_AT_type + .byte 0 # End Of Children Mark +.Lint: + .byte 4 # Abbrev [4] 0x53:0x7 DW_TAG_base_type + .asciz "int" # DW_AT_name + .byte 5 # DW_AT_encoding + .byte 4 # DW_AT_byte_size + .byte 0 # End Of Children Mark +.Lloc_cu_end: + + .long .Lloclists_cu_end-.Lloclists_cu_start # Length of Unit +.Lloclists_cu_start: + .short 5 # DWARF version number + .byte 1 # DWARF Unit Type + .byte 8 # Address Size (in bytes) + .long .debug_abbrev # Offset Into Abbrev. Section + .byte 1 # Abbrev [1] 0xb:0x50 DW_TAG_compile_unit + .asciz "Hand-written DWARF" # DW_AT_producer + .short 12 # DW_AT_language + .quad loclists # DW_AT_low_pc + .long .Lloclists_end-loclists # DW_AT_high_pc + .byte 2 # Abbrev [2] 0x2a:0x29 DW_TAG_subprogram + .quad loclists # DW_AT_low_pc + .long .Lloclists_end-loclists # DW_AT_high_pc + .asciz "loclists" # DW_AT_name + .long .Lint # DW_AT_type + .byte 3 # Abbrev [3] DW_TAG_formal_parameter + .long .Lloclists_list # DW_AT_location + .asciz "x1" # DW_AT_name + .long .Lint # DW_AT_type + .byte 0 # End Of Children Mark + .byte 0 # End Of Children Mark +.Lloclists_cu_end: From d462185e8daa49889c31c8f5568749e379a5ddf9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jan 2020 12:33:37 +0000 Subject: [PATCH 48/89] Fix "pointer is null" static analyzer warning. NFCI. Use cast<> instead of dyn_cast<> since we know that the pointer should be valid (and is dereferenced immediately below). --- clang/lib/Sema/SemaTemplateInstantiateDecl.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index a8fc77fff7d50..98e05f099198a 100755 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -655,11 +655,10 @@ void Sema::InstantiateAttrs(const MultiLevelTemplateArgumentList &TemplateArgs, LateAttrs->push_back(LateInstantiatedAttribute(TmplAttr, Saved, New)); } else { // Allow 'this' within late-parsed attributes. - NamedDecl *ND = dyn_cast(New); - CXXRecordDecl *ThisContext = - dyn_cast_or_null(ND->getDeclContext()); + auto *ND = cast(New); + auto *ThisContext = dyn_cast_or_null(ND->getDeclContext()); CXXThisScopeRAII ThisScope(*this, ThisContext, Qualifiers(), - ND && ND->isCXXInstanceMember()); + ND->isCXXInstanceMember()); Attr *NewAttr = sema::instantiateTemplateAttribute(TmplAttr, Context, *this, TemplateArgs); From 25dd99fab205619ef90d0da23bcfba3c9f762e5e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 9 Jan 2020 12:35:21 +0000 Subject: [PATCH 49/89] Fix "pointer is null" static analyzer warning. NFCI. Use cast<> instead of dyn_cast<> since we know that the pointer should be valid (and is dereferenced immediately). 
--- llvm/lib/IR/Verifier.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 50648ba17945b..d232946af2942 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -4779,7 +4779,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) { case Intrinsic::experimental_constrained_fcmp: case Intrinsic::experimental_constrained_fcmps: { - auto Pred = dyn_cast(&FPI)->getPredicate(); + auto Pred = cast(&FPI)->getPredicate(); Assert(CmpInst::isFPPredicate(Pred), "invalid predicate for constrained FP comparison intrinsic", &FPI); break; From 9c91d79dadc660cb6a0ec736389341debd8cd118 Mon Sep 17 00:00:00 2001 From: Sam Parker Date: Thu, 9 Jan 2020 12:52:50 +0000 Subject: [PATCH 50/89] [NFC][ARM] LowOverheadLoop comments Add a comment describing the dependencies of the pass. --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp index 31a98d86a54d2..6c45eecf0c23d 100644 --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -19,6 +19,22 @@ /// which determines whether we can generated the tail-predicated low-overhead /// loop form. /// +/// Assumptions and Dependencies: +/// Low-overhead loops are constructed and executed using a setup instruction: +/// DLS, WLS, DLSTP or WLSTP and an instruction that loops back: LE or LETP. +/// WLS(TP) and LE(TP) are branching instructions with a (large) limited range +/// but fixed polarity: WLS can only branch forwards and LE can only branch +/// backwards. These restrictions mean that this pass is dependent upon block +/// layout and block sizes, which is why it's the last pass to run. The same is +/// true for ConstantIslands, but this pass does not increase the size of the +/// basic blocks, nor does it change the CFG. Instructions are mainly removed +/// during the transform and pseudo instructions are replaced by real ones. In +/// some cases, when we have to revert to a 'normal' loop, we have to introduce +/// multiple instructions for a single pseudo (see RevertWhile and +/// RevertLoopEnd). To handle this situation, t2WhileLoopStart and t2LoopEnd +/// are defined to be as large as this maximum sequence of replacement +/// instructions. +/// //===----------------------------------------------------------------------===// #include "ARM.h" From b65b1f322bd88513586a4539d2b5f18aeb698f3f Mon Sep 17 00:00:00 2001 From: Saar Raz Date: Thu, 9 Jan 2020 15:07:51 +0200 Subject: [PATCH 51/89] [Concepts] Function trailing requires clauses Function trailing requires clauses now parsed, supported in overload resolution and when calling, referencing and taking the address of functions or function templates. 
Differential Revision: https://reviews.llvm.org/D43357 --- clang/include/clang/AST/ASTLambda.h | 11 + clang/include/clang/AST/ASTNodeTraverser.h | 3 + clang/include/clang/AST/Decl.h | 43 ++- clang/include/clang/AST/DeclCXX.h | 31 +- clang/include/clang/AST/RecursiveASTVisitor.h | 5 + .../clang/Basic/DiagnosticParseKinds.td | 15 +- .../clang/Basic/DiagnosticSemaKinds.td | 18 + clang/include/clang/Parse/Parser.h | 36 +- clang/include/clang/Sema/DeclSpec.h | 23 +- clang/include/clang/Sema/Overload.h | 6 +- clang/include/clang/Sema/Sema.h | 54 ++- clang/include/clang/Sema/SemaConcept.h | 145 +++++++ clang/lib/AST/ASTImporter.cpp | 20 +- clang/lib/AST/Decl.cpp | 43 ++- clang/lib/AST/DeclCXX.cpp | 38 +- clang/lib/AST/DeclPrinter.cpp | 5 + clang/lib/AST/DeclTemplate.cpp | 11 +- clang/lib/Parse/ParseDecl.cpp | 119 ++++-- clang/lib/Parse/ParseDeclCXX.cpp | 61 +++ clang/lib/Parse/ParseExpr.cpp | 304 ++++++++++++--- clang/lib/Parse/ParseExprCXX.cpp | 30 +- clang/lib/Parse/ParseOpenMP.cpp | 6 +- clang/lib/Parse/ParseTemplate.cpp | 10 +- clang/lib/Parse/ParseTentative.cpp | 5 +- clang/lib/Sema/SemaCast.cpp | 2 +- clang/lib/Sema/SemaConcept.cpp | 365 ++++++++++-------- clang/lib/Sema/SemaDecl.cpp | 39 +- clang/lib/Sema/SemaDeclAttr.cpp | 3 +- clang/lib/Sema/SemaDeclCXX.cpp | 28 +- clang/lib/Sema/SemaExpr.cpp | 31 +- clang/lib/Sema/SemaLambda.cpp | 8 +- clang/lib/Sema/SemaOverload.cpp | 255 ++++++++++-- clang/lib/Sema/SemaTemplate.cpp | 5 + clang/lib/Sema/SemaTemplateDeduction.cpp | 24 +- clang/lib/Sema/SemaTemplateInstantiate.cpp | 2 +- .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 84 +++- clang/lib/Sema/SemaTemplateVariadic.cpp | 4 + clang/lib/Sema/TreeTransform.h | 10 +- clang/lib/Serialization/ASTReaderDecl.cpp | 1 + clang/lib/Serialization/ASTWriterDecl.cpp | 7 +- .../CXX/class.derived/class.virtual/p6.cpp | 21 + clang/test/CXX/dcl/dcl.decl/p3.cpp | 35 ++ .../expr.prim.id/mixed-constraints.cpp | 18 + .../CXX/expr/expr.prim/expr.prim.id/p4.cpp | 58 +++ .../expr.prim.lambda.closure/p3.cpp | 20 + .../over/over.match/over.match.best/p1-2a.cpp | 114 ++++++ .../over/over.match/over.match.viable/p3.cpp | 63 +++ clang/test/CXX/over/over.over/p4-2a.cpp | 61 +++ .../temp.constr.constr/function-templates.cpp | 5 +- .../non-function-templates.cpp | 16 +- .../partial-specializations.cpp | 12 +- .../temp.constr.decl/class-template-decl.cpp | 12 +- .../temp.constr.decl/func-template-decl.cpp | 10 +- .../temp.constr.decl/var-template-decl.cpp | 8 +- ...class-template-partial-specializations.cpp | 21 +- .../temp.constr.order/function-templates.cpp | 10 +- .../var-template-partial-specializations.cpp | 8 +- clang/test/CXX/temp/temp.explicit/p8.cpp | 22 ++ .../cxx-concepts-ambig-constraint-expr.cpp | 24 +- .../Parser/cxx-concepts-requires-clause.cpp | 99 ++++- .../instantiate-requires-clause.cpp | 31 ++ 61 files changed, 2109 insertions(+), 469 deletions(-) create mode 100644 clang/include/clang/Sema/SemaConcept.h create mode 100644 clang/test/CXX/class.derived/class.virtual/p6.cpp create mode 100644 clang/test/CXX/dcl/dcl.decl/p3.cpp create mode 100644 clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp create mode 100644 clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp create mode 100644 clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp create mode 100644 clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp create mode 100644 clang/test/CXX/over/over.match/over.match.viable/p3.cpp create mode 100644 clang/test/CXX/over/over.over/p4-2a.cpp create mode 100644 
clang/test/CXX/temp/temp.explicit/p8.cpp create mode 100644 clang/test/SemaTemplate/instantiate-requires-clause.cpp diff --git a/clang/include/clang/AST/ASTLambda.h b/clang/include/clang/AST/ASTLambda.h index c1153168e41bb..6fd82d6af4908 100644 --- a/clang/include/clang/AST/ASTLambda.h +++ b/clang/include/clang/AST/ASTLambda.h @@ -64,6 +64,17 @@ inline bool isGenericLambdaCallOperatorSpecialization(DeclContext *DC) { dyn_cast(DC)); } +inline bool isGenericLambdaCallOperatorOrStaticInvokerSpecialization( + DeclContext *DC) { + CXXMethodDecl *MD = dyn_cast(DC); + if (!MD) return false; + const CXXRecordDecl *LambdaClass = MD->getParent(); + if (LambdaClass && LambdaClass->isGenericLambda()) + return (isLambdaCallOperator(MD) || MD->isLambdaStaticInvoker()) && + MD->isFunctionTemplateSpecialization(); + return false; +} + // This returns the parent DeclContext ensuring that the correct // parent DeclContext is returned for Lambdas diff --git a/clang/include/clang/AST/ASTNodeTraverser.h b/clang/include/clang/AST/ASTNodeTraverser.h index 9dab814b659ba..a672d92695da9 100644 --- a/clang/include/clang/AST/ASTNodeTraverser.h +++ b/clang/include/clang/AST/ASTNodeTraverser.h @@ -384,6 +384,9 @@ class ASTNodeTraverser for (const auto *Parameter : D->parameters()) Visit(Parameter); + if (const Expr *TRC = D->getTrailingRequiresClause()) + Visit(TRC); + if (const auto *C = dyn_cast(D)) for (const auto *I : C->inits()) Visit(I); diff --git a/clang/include/clang/AST/Decl.h b/clang/include/clang/AST/Decl.h index cd97c6dcf8d5c..002d1434b1cbd 100644 --- a/clang/include/clang/AST/Decl.h +++ b/clang/include/clang/AST/Decl.h @@ -669,10 +669,12 @@ struct QualifierInfo { /// Represents a ValueDecl that came out of a declarator. /// Contains type source information through TypeSourceInfo. class DeclaratorDecl : public ValueDecl { - // A struct representing both a TInfo and a syntactic qualifier, - // to be used for the (uncommon) case of out-of-line declarations. + // A struct representing a TInfo, a trailing requires-clause and a syntactic + // qualifier, to be used for the (uncommon) case of out-of-line declarations + // and constrained function decls. struct ExtInfo : public QualifierInfo { TypeSourceInfo *TInfo; + Expr *TrailingRequiresClause = nullptr; }; llvm::PointerUnion DeclInfo; @@ -739,6 +741,21 @@ class DeclaratorDecl : public ValueDecl { void setQualifierInfo(NestedNameSpecifierLoc QualifierLoc); + /// \brief Get the constraint-expression introduced by the trailing + /// requires-clause in the function/member declaration, or null if no + /// requires-clause was provided. + Expr *getTrailingRequiresClause() { + return hasExtInfo() ? getExtInfo()->TrailingRequiresClause + : nullptr; + } + + const Expr *getTrailingRequiresClause() const { + return hasExtInfo() ? getExtInfo()->TrailingRequiresClause + : nullptr; + } + + void setTrailingRequiresClause(Expr *TrailingRequiresClause); + unsigned getNumTemplateParameterLists() const { return hasExtInfo() ? 
getExtInfo()->NumTemplParamLists : 0; } @@ -1903,7 +1920,8 @@ class FunctionDecl : public DeclaratorDecl, FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass S, bool isInlineSpecified, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr); using redeclarable_base = Redeclarable; @@ -1938,11 +1956,12 @@ class FunctionDecl : public DeclaratorDecl, SourceLocation NLoc, DeclarationName N, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified = false, bool hasWrittenPrototype = true, - ConstexprSpecKind ConstexprKind = CSK_unspecified) { + ConstexprSpecKind ConstexprKind = CSK_unspecified, + Expr *TrailingRequiresClause = nullptr) { DeclarationNameInfo NameInfo(N, NLoc); return FunctionDecl::Create(C, DC, StartLoc, NameInfo, T, TInfo, SC, isInlineSpecified, hasWrittenPrototype, - ConstexprKind); + ConstexprKind, TrailingRequiresClause); } static FunctionDecl *Create(ASTContext &C, DeclContext *DC, @@ -1950,7 +1969,8 @@ class FunctionDecl : public DeclaratorDecl, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified, bool hasWrittenPrototype, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause); static FunctionDecl *CreateDeserialized(ASTContext &C, unsigned ID); @@ -2352,6 +2372,17 @@ class FunctionDecl : public DeclaratorDecl, /// the target functionality. bool isTargetMultiVersion() const; + /// \brief Get the associated-constraints of this function declaration. + /// Currently, this will either be a vector of size 1 containing the + /// trailing-requires-clause or an empty vector. + /// + /// Use this instead of getTrailingRequiresClause for concepts APIs that + /// accept an ArrayRef of constraint expressions. 
+ void getAssociatedConstraints(SmallVectorImpl &AC) const { + if (auto *TRC = getTrailingRequiresClause()) + AC.push_back(TRC); + } + void setPreviousDeclaration(FunctionDecl * PrevDecl); FunctionDecl *getCanonicalDecl() override; diff --git a/clang/include/clang/AST/DeclCXX.h b/clang/include/clang/AST/DeclCXX.h index 1c6f99438fc3e..aba33e383976c 100644 --- a/clang/include/clang/AST/DeclCXX.h +++ b/clang/include/clang/AST/DeclCXX.h @@ -1905,9 +1905,10 @@ class CXXMethodDecl : public FunctionDecl { SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) + SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr) : FunctionDecl(DK, C, RD, StartLoc, NameInfo, T, TInfo, SC, isInline, - ConstexprKind) { + ConstexprKind, TrailingRequiresClause) { if (EndLocation.isValid()) setRangeEnd(EndLocation); } @@ -1918,7 +1919,8 @@ class CXXMethodDecl : public FunctionDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation); + SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr); static CXXMethodDecl *CreateDeserialized(ASTContext &C, unsigned ID); @@ -2363,7 +2365,8 @@ class CXXConstructorDecl final const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited); + InheritedConstructor Inherited, + Expr *TrailingRequiresClause); void anchor() override; @@ -2416,7 +2419,8 @@ class CXXConstructorDecl final const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, - InheritedConstructor Inherited = InheritedConstructor()); + InheritedConstructor Inherited = InheritedConstructor(), + Expr *TrailingRequiresClause = nullptr); ExplicitSpecifier getExplicitSpecifier() { return getCanonicalDecl()->getExplicitSpecifierInternal(); @@ -2623,9 +2627,11 @@ class CXXDestructorDecl : public CXXMethodDecl { CXXDestructorDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, - bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind) + bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr) : CXXMethodDecl(CXXDestructor, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, SourceLocation()) { + SC_None, isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause) { setImplicit(isImplicitlyDeclared); } @@ -2637,7 +2643,8 @@ class CXXDestructorDecl : public CXXMethodDecl { const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause = nullptr); static CXXDestructorDecl *CreateDeserialized(ASTContext & C, unsigned ID); void setOperatorDelete(FunctionDecl *OD, Expr *ThisArg); @@ -2676,9 +2683,11 @@ class CXXConversionDecl : public CXXMethodDecl { CXXConversionDecl(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, - ConstexprSpecKind ConstexprKind, SourceLocation 
EndLocation) + ConstexprSpecKind ConstexprKind, SourceLocation EndLocation, + Expr *TrailingRequiresClause = nullptr) : CXXMethodDecl(CXXConversion, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, EndLocation), + SC_None, isInline, ConstexprKind, EndLocation, + TrailingRequiresClause), ExplicitSpec(ES) {} void anchor() override; @@ -2694,7 +2703,7 @@ class CXXConversionDecl : public CXXMethodDecl { Create(ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation); + SourceLocation EndLocation, Expr *TrailingRequiresClause = nullptr); static CXXConversionDecl *CreateDeserialized(ASTContext &C, unsigned ID); ExplicitSpecifier getExplicitSpecifier() { diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 19dd62b0fe0fd..144ef221d6920 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -2030,6 +2030,11 @@ bool RecursiveASTVisitor::TraverseFunctionHelper(FunctionDecl *D) { } } + // Visit the trailing requires clause, if any. + if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { + TRY_TO(TraverseStmt(TrailingRequiresClause)); + } + if (CXXConstructorDecl *Ctor = dyn_cast(D)) { // Constructor initializers. for (auto *I : Ctor->inits()) { diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 5f3821fcecb6c..cc6a74ac3e6dd 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -181,6 +181,13 @@ def err_function_declared_typedef : Error< def err_at_defs_cxx : Error<"@defs is not supported in Objective-C++">; def err_at_in_class : Error<"unexpected '@' in member specification">; def err_unexpected_semi : Error<"unexpected ';' before %0">; +def err_unparenthesized_non_primary_expr_in_requires_clause : Error< + "parentheses are required around this expression in a requires clause">; +def note_unparenthesized_non_primary_expr_in_requires_clause : Note< + "parentheses are required around this expression in a requires clause">; +def err_potential_function_call_in_constraint_logical_or : Error< + "function call must be parenthesized to be considered part of the requires " + "clause">; def err_expected_fn_body : Error< "expected function body after function declarator">; @@ -309,6 +316,12 @@ def err_init_list_bin_op : Error<"initializer list cannot be used on the " def warn_cxx98_compat_trailing_return_type : Warning< "trailing return types are incompatible with C++98">, InGroup, DefaultIgnore; +def err_requires_clause_must_appear_after_trailing_return : Error< + "trailing return type must appear before trailing requires clause">; +def err_requires_clause_on_declarator_not_declaring_a_function : Error< + "trailing requires clause can only be used when declaring a function">; +def err_requires_clause_inside_parens : Error< + "trailing requires clause should be placed outside parentheses">; def ext_auto_storage_class : ExtWarn< "'auto' storage class specifier is not permitted in C++11, and will not " "be supported in future releases">, InGroup>; @@ -880,7 +893,7 @@ def warn_cxx98_compat_lambda : Warning< InGroup, DefaultIgnore; def err_lambda_missing_parens : Error< "lambda requires '()' before %select{'mutable'|return type|" - "attribute 
specifier|'constexpr'|'consteval'}0">; + "attribute specifier|'constexpr'|'consteval'|'requires' clause}0">; def err_lambda_decl_specifier_repeated : Error< "%select{'mutable'|'constexpr'|'consteval'}0 cannot appear multiple times in a lambda declarator">; def err_lambda_capture_misplaced_ellipsis : Error< diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dd9649bcb5c31..545c1bdf6e74f 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2596,6 +2596,18 @@ def note_single_arg_concept_specialization_constraint_evaluated_to_false : Note< "%select{and |because }0%1 does not satisfy %2">; def note_atomic_constraint_evaluated_to_false_elaborated : Note< "%select{and |because }0'%1' (%2 %3 %4) evaluated to false">; +def err_constrained_virtual_method : Error< + "virtual function cannot have a requires clause">; +def err_trailing_requires_clause_on_deduction_guide : Error< + "deduction guide cannot have a requires clause">; +def err_reference_to_function_with_unsatisfied_constraints : Error< + "invalid reference to function %0: constraints not satisfied">; +def note_ambiguous_atomic_constraints : Note< + "similar constraint expressions not considered equivalent; constraint " + "expressions cannot be considered equivalent unless they originate from the " + "same concept">; +def note_ambiguous_atomic_constraints_similar_expression : Note< + "similar constraint expression here">; def err_template_different_requires_clause : Error< "requires clause differs in template redeclaration">; @@ -3920,6 +3932,9 @@ def note_ovl_candidate_disabled_by_extension : Note< def err_addrof_function_disabled_by_enable_if_attr : Error< "cannot take address of function %0 because it has one or more " "non-tautological enable_if conditions">; +def err_addrof_function_constraints_not_satisfied : Error< + "cannot take address of function %0 because its constraints are not " + "satisfied">; def note_addrof_ovl_candidate_disabled_by_enable_if_attr : Note< "candidate function made ineligible by enable_if">; def note_ovl_candidate_deduced_mismatch : Note< @@ -4033,6 +4048,9 @@ def note_ovl_candidate_bad_target : Note< "call to " "%select{__device__|__global__|__host__|__host__ __device__|invalid}3 function from" " %select{__device__|__global__|__host__|__host__ __device__|invalid}4 function">; +def note_ovl_candidate_constraints_not_satisfied : Note< + "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: constraints " + "not satisfied">; def note_implicit_member_target_infer_collision : Note< "implicit %sub{select_special_member_kind}0 inferred target collision: call to both " "%select{__device__|__global__|__host__|__host__ __device__}1 and " diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index d41bbd86561c1..eb2f4152e72c6 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -1679,6 +1679,9 @@ class Parser : public CodeCompletionHandler { ExprResult ParseConstantExpression(TypeCastState isTypeCast = NotTypeCast); ExprResult ParseCaseExpression(SourceLocation CaseLoc); ExprResult ParseConstraintExpression(); + ExprResult + ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause); + ExprResult ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause); // Expr that doesn't include commas. 
ExprResult ParseAssignmentExpression(TypeCastState isTypeCast = NotTypeCast); @@ -1693,15 +1696,23 @@ class Parser : public CodeCompletionHandler { ExprResult ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec); - ExprResult ParseCastExpression(bool isUnaryExpression, + /// Control what ParseCastExpression will parse. + enum CastParseKind { + AnyCastExpr = 0, + UnaryExprOnly, + PrimaryExprOnly + }; + ExprResult ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, bool &NotCastExpr, TypeCastState isTypeCast, - bool isVectorLiteral = false); - ExprResult ParseCastExpression(bool isUnaryExpression, + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); + ExprResult ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand = false, TypeCastState isTypeCast = NotTypeCast, - bool isVectorLiteral = false); + bool isVectorLiteral = false, + bool *NotPrimaryExpression = nullptr); /// Returns true if the next token cannot start an expression. bool isNotExpressionStart(); @@ -1909,6 +1920,11 @@ class Parser : public CodeCompletionHandler { ExprResult ParseCoyieldExpression(); + //===--------------------------------------------------------------------===// + // C++ Concepts + + void ParseTrailingRequiresClause(Declarator &D); + //===--------------------------------------------------------------------===// // C99 6.7.8: Initialization. @@ -2730,6 +2746,9 @@ class Parser : public CodeCompletionHandler { BalancedDelimiterTracker &Tracker, bool IsAmbiguous, bool RequiresArg = false); + void InitCXXThisScopeForDeclaratorIfRelevant( + const Declarator &D, const DeclSpec &DS, + llvm::Optional &ThisScope); bool ParseRefQualifier(bool &RefQualifierIsLValueRef, SourceLocation &RefQualifierLoc); bool isFunctionDeclaratorIdentifierList(); @@ -2837,10 +2856,11 @@ class Parser : public CodeCompletionHandler { Decl *TagDecl); ExprResult ParseCXXMemberInitializer(Decl *D, bool IsFunction, SourceLocation &EqualLoc); - bool ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, - VirtSpecifiers &VS, - ExprResult &BitfieldSize, - LateParsedAttrList &LateAttrs); + bool + ParseCXXMemberDeclaratorBeforeInitializer(Declarator &DeclaratorInfo, + VirtSpecifiers &VS, + ExprResult &BitfieldSize, + LateParsedAttrList &LateAttrs); void MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq(Declarator &D, VirtSpecifiers &VS); DeclGroupPtrTy ParseCXXClassMemberDeclaration( diff --git a/clang/include/clang/Sema/DeclSpec.h b/clang/include/clang/Sema/DeclSpec.h index e3ead60bb43f6..aceec9cbe1c9e 100644 --- a/clang/include/clang/Sema/DeclSpec.h +++ b/clang/include/clang/Sema/DeclSpec.h @@ -1826,6 +1826,10 @@ class Declarator { /// The asm label, if specified. Expr *AsmLabel; + /// \brief The constraint-expression specified by the trailing + /// requires-clause, or null if no such clause was specified. + Expr *TrailingRequiresClause; + #ifndef _MSC_VER union { #endif @@ -1855,7 +1859,8 @@ class Declarator { GroupingParens(false), FunctionDefinition(FDK_Declaration), Redeclaration(false), Extension(false), ObjCIvar(false), ObjCWeakProperty(false), InlineStorageUsed(false), - Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr) {} + Attrs(ds.getAttributePool().getFactory()), AsmLabel(nullptr), + TrailingRequiresClause(nullptr) {} ~Declarator() { clear(); @@ -2401,6 +2406,22 @@ class Declarator { return false; } + /// \brief Sets a trailing requires clause for this declarator. 
+ void setTrailingRequiresClause(Expr *TRC) { + TrailingRequiresClause = TRC; + } + + /// \brief Sets a trailing requires clause for this declarator. + Expr *getTrailingRequiresClause() { + return TrailingRequiresClause; + } + + /// \brief Determine whether a trailing requires clause was written in this + /// declarator. + bool hasTrailingRequiresClause() const { + return TrailingRequiresClause != nullptr; + } + /// takeAttributes - Takes attributes from the given parsed-attributes /// set and add them to this declarator. /// diff --git a/clang/include/clang/Sema/Overload.h b/clang/include/clang/Sema/Overload.h index e0c3ba13ef543..0ccb658c6a771 100644 --- a/clang/include/clang/Sema/Overload.h +++ b/clang/include/clang/Sema/Overload.h @@ -754,7 +754,11 @@ class Sema; /// This constructor/conversion candidate fail due to an address space /// mismatch between the object being constructed and the overload /// candidate. - ovl_fail_object_addrspace_mismatch + ovl_fail_object_addrspace_mismatch, + + /// This candidate was not viable because its associated constraints were + /// not satisfied. + ovl_fail_constraints_not_satisfied, }; /// A list of implicit conversion sequences for the arguments of an diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 2d7aa9462db1e..9a1fd70b88979 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -47,6 +47,7 @@ #include "clang/Sema/ObjCMethodList.h" #include "clang/Sema/Ownership.h" #include "clang/Sema/Scope.h" +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/TypoCorrection.h" #include "clang/Sema/Weak.h" #include "llvm/ADT/ArrayRef.h" @@ -2300,6 +2301,8 @@ class Sema final { SkipBodyInfo *SkipBody = nullptr); Decl *ActOnStartOfFunctionDef(Scope *S, Decl *D, SkipBodyInfo *SkipBody = nullptr); + void ActOnStartTrailingRequiresClause(Scope *S, Declarator &D); + ExprResult ActOnFinishTrailingRequiresClause(ExprResult ConstraintExpr); void ActOnStartOfObjCMethodDef(Scope *S, Decl *D); bool isObjCMethodDecl(Decl *D) { return D && isa(D); @@ -2884,7 +2887,8 @@ class Sema final { NamedDecl *&OldDecl, bool IsForUsingDecl); bool IsOverload(FunctionDecl *New, FunctionDecl *Old, bool IsForUsingDecl, - bool ConsiderCudaAttrs = true); + bool ConsiderCudaAttrs = true, + bool ConsiderRequiresClauses = true); ImplicitConversionSequence TryImplicitConversion(Expr *From, QualType ToType, @@ -3249,10 +3253,9 @@ class Sema final { bool *pHadMultipleCandidates = nullptr); FunctionDecl * - resolveAddressOfOnlyViableOverloadCandidate(Expr *E, - DeclAccessPair &FoundResult); + resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &FoundResult); - bool resolveAndFixAddressOfOnlyViableOverloadCandidate( + bool resolveAndFixAddressOfSingleOverloadCandidate( ExprResult &SrcExpr, bool DoFunctionPointerConversion = false); FunctionDecl * @@ -6035,7 +6038,8 @@ class Sema final { TypeSourceInfo *MethodType, SourceLocation EndLoc, ArrayRef Params, - ConstexprSpecKind ConstexprKind); + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause); /// Number lambda for linkage purposes if necessary. void handleLambdaNumbering( @@ -6169,16 +6173,35 @@ class Sema final { Expr *Src); /// Check whether the given expression is a valid constraint expression. - /// A diagnostic is emitted if it is not, and false is returned. 
- bool CheckConstraintExpression(Expr *CE); + /// A diagnostic is emitted if it is not, false is returned, and + /// PossibleNonPrimary will be set to true if the failure might be due to a + /// non-primary expression being used as an atomic constraint. + bool CheckConstraintExpression(Expr *CE, Token NextToken = Token(), + bool *PossibleNonPrimary = nullptr, + bool IsTrailingRequiresClause = false); + + /// Check whether the given type-dependent expression will be the name of a + /// function or another callable function-like entity (e.g. a function + // template or overload set) for any substitution. + bool IsDependentFunctionNameExpr(Expr *E); private: - /// \brief Caches pairs of template-like decls whose associated constraints - /// were checked for subsumption and whether or not the first's constraints - /// did in fact subsume the second's. + /// Caches pairs of template-like decls whose associated constraints were + /// checked for subsumption and whether or not the first's constraints did in + /// fact subsume the second's. llvm::DenseMap, bool> SubsumptionCache; + /// Caches the normalized associated constraints of declarations (concepts or + /// constrained declarations). If an error occurred while normalizing the + /// associated constraints of the template or concept, nullptr will be cached + /// here. + llvm::DenseMap + NormalizationCache; public: + const NormalizedConstraint * + getNormalizedAssociatedConstraints( + NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints); + /// \brief Check whether the given declaration's associated constraints are /// at least as constrained than another declaration's according to the /// partial ordering of constraints. @@ -6191,6 +6214,13 @@ class Sema final { NamedDecl *D2, ArrayRef AC2, bool &Result); + /// If D1 was not at least as constrained as D2, but would've been if a pair + /// of atomic constraints involved had been declared in a concept and not + /// repeated in two separate places in code. + /// \returns true if such a diagnostic was emitted, false otherwise. + bool MaybeEmitAmbiguousAtomicConstraintsDiagnostic(NamedDecl *D1, + ArrayRef AC1, NamedDecl *D2, ArrayRef AC2); + /// \brief Check whether the given list of constraint expressions are /// satisfied (as if in a 'conjunction') given template arguments. /// \param ConstraintExprs a list of constraint expressions, treated as if @@ -8644,6 +8674,10 @@ class Sema final { void InstantiateExceptionSpec(SourceLocation PointOfInstantiation, FunctionDecl *Function); + bool CheckInstantiatedFunctionTemplateConstraints( + SourceLocation PointOfInstantiation, FunctionDecl *Decl, + ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction); FunctionDecl *InstantiateFunctionDeclaration(FunctionTemplateDecl *FTD, const TemplateArgumentList *Args, SourceLocation Loc); diff --git a/clang/include/clang/Sema/SemaConcept.h b/clang/include/clang/Sema/SemaConcept.h new file mode 100644 index 0000000000000..acd1e604211a1 --- /dev/null +++ b/clang/include/clang/Sema/SemaConcept.h @@ -0,0 +1,145 @@ +//===-- SemaConcept.h - Semantic Analysis for Constraints and Concepts ----===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +// This file provides semantic analysis for C++ constraints and concepts. 
+/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_SEMA_SEMACONCEPT_H +#define LLVM_CLANG_SEMA_SEMACONCEPT_H +#include "clang/AST/ASTContext.h" +#include "clang/AST/Expr.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/SmallVector.h" +namespace clang { +class Sema; + +struct AtomicConstraint { + const Expr *ConstraintExpr; + Optional> ParameterMapping; + + AtomicConstraint(Sema &S, const Expr *ConstraintExpr) : + ConstraintExpr(ConstraintExpr) { }; + + bool hasMatchingParameterMapping(ASTContext &C, + const AtomicConstraint &Other) const { + if (!ParameterMapping != !Other.ParameterMapping) + return false; + if (!ParameterMapping) + return true; + if (ParameterMapping->size() != Other.ParameterMapping->size()) + return false; + + for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) + if (!C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) + .structurallyEquals(C.getCanonicalTemplateArgument( + (*Other.ParameterMapping)[I].getArgument()))) + return false; + return true; + } + + bool subsumes(ASTContext &C, const AtomicConstraint &Other) const { + // C++ [temp.constr.order] p2 + // - an atomic constraint A subsumes another atomic constraint B + // if and only if the A and B are identical [...] + // + // C++ [temp.constr.atomic] p2 + // Two atomic constraints are identical if they are formed from the + // same expression and the targets of the parameter mappings are + // equivalent according to the rules for expressions [...] + + // We do not actually substitute the parameter mappings into the + // constraint expressions, therefore the constraint expressions are + // the originals, and comparing them will suffice. + if (ConstraintExpr != Other.ConstraintExpr) + return false; + + // Check that the parameter lists are identical + return hasMatchingParameterMapping(C, Other); + } +}; + +/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is +/// either an atomic constraint, a conjunction of normalized constraints or a +/// disjunction of normalized constraints. 
+struct NormalizedConstraint { + friend class Sema; + + enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; + + using CompoundConstraint = llvm::PointerIntPair< + std::pair *, 1, + CompoundConstraintKind>; + + llvm::PointerUnion Constraint; + + NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; + NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, + NormalizedConstraint RHS, CompoundConstraintKind Kind) + : Constraint{CompoundConstraint{ + new (C) std::pair{ + std::move(LHS), std::move(RHS)}, Kind}} { }; + + NormalizedConstraint(ASTContext &C, const NormalizedConstraint &Other) { + if (Other.isAtomic()) { + Constraint = new (C) AtomicConstraint(*Other.getAtomicConstraint()); + } else { + Constraint = CompoundConstraint( + new (C) std::pair{ + NormalizedConstraint(C, Other.getLHS()), + NormalizedConstraint(C, Other.getRHS())}, + Other.getCompoundKind()); + } + } + NormalizedConstraint(NormalizedConstraint &&Other): + Constraint(Other.Constraint) { + Other.Constraint = nullptr; + } + NormalizedConstraint &operator=(const NormalizedConstraint &Other) = delete; + NormalizedConstraint &operator=(NormalizedConstraint &&Other) { + if (&Other != this) { + NormalizedConstraint Temp(std::move(Other)); + std::swap(Constraint, Temp.Constraint); + } + return *this; + } + + CompoundConstraintKind getCompoundKind() const { + assert(!isAtomic() && "getCompoundKind called on atomic constraint."); + return Constraint.get().getInt(); + } + + bool isAtomic() const { return Constraint.is(); } + + NormalizedConstraint &getLHS() const { + assert(!isAtomic() && "getLHS called on atomic constraint."); + return Constraint.get().getPointer()->first; + } + + NormalizedConstraint &getRHS() const { + assert(!isAtomic() && "getRHS called on atomic constraint."); + return Constraint.get().getPointer()->second; + } + + AtomicConstraint *getAtomicConstraint() const { + assert(isAtomic() && + "getAtomicConstraint called on non-atomic constraint."); + return Constraint.get(); + } + +private: + static Optional + fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E); + static Optional + fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E); +}; + +} // clang + +#endif //LLVM_CLANG_SEMA_SEMACONCEPT_H diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index 567d2bf7d228d..f6c3aa1a3c1d2 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -3279,10 +3279,12 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { TypeSourceInfo *TInfo; SourceLocation ToInnerLocStart, ToEndLoc; NestedNameSpecifierLoc ToQualifierLoc; + Expr *TrailingRequiresClause; if (auto Imp = importSeq( FromTy, D->getTypeSourceInfo(), D->getInnerLocStart(), - D->getQualifierLoc(), D->getEndLoc())) - std::tie(T, TInfo, ToInnerLocStart, ToQualifierLoc, ToEndLoc) = *Imp; + D->getQualifierLoc(), D->getEndLoc(), D->getTrailingRequiresClause())) + std::tie(T, TInfo, ToInnerLocStart, ToQualifierLoc, ToEndLoc, + TrailingRequiresClause) = *Imp; else return Imp.takeError(); @@ -3311,7 +3313,10 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { ExplicitSpecifier( ExplicitExpr, FromConstructor->getExplicitSpecifier().getKind()), - D->isInlineSpecified(), D->isImplicit(), D->getConstexprKind())) + D->isInlineSpecified(), D->isImplicit(), D->getConstexprKind(), + InheritedConstructor(), // FIXME: Properly import inherited + // constructor info + TrailingRequiresClause)) return ToFunction; } else if (CXXDestructorDecl *FromDtor = dyn_cast(D)) { @@ -3329,7 
+3334,7 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), cast(DC), ToInnerLocStart, NameInfo, T, TInfo, D->isInlineSpecified(), - D->isImplicit(), D->getConstexprKind())) + D->isImplicit(), D->getConstexprKind(), TrailingRequiresClause)) return ToFunction; CXXDestructorDecl *ToDtor = cast(ToFunction); @@ -3349,20 +3354,21 @@ ExpectedDecl ASTNodeImporter::VisitFunctionDecl(FunctionDecl *D) { ToInnerLocStart, NameInfo, T, TInfo, D->isInlineSpecified(), ExplicitSpecifier(ExplicitExpr, FromConversion->getExplicitSpecifier().getKind()), - D->getConstexprKind(), SourceLocation())) + D->getConstexprKind(), SourceLocation(), TrailingRequiresClause)) return ToFunction; } else if (auto *Method = dyn_cast(D)) { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), cast(DC), ToInnerLocStart, NameInfo, T, TInfo, Method->getStorageClass(), Method->isInlineSpecified(), D->getConstexprKind(), - SourceLocation())) + SourceLocation(), TrailingRequiresClause)) return ToFunction; } else { if (GetImportedOrCreateDecl( ToFunction, D, Importer.getToContext(), DC, ToInnerLocStart, NameInfo, T, TInfo, D->getStorageClass(), D->isInlineSpecified(), - D->hasWrittenPrototype(), D->getConstexprKind())) + D->hasWrittenPrototype(), D->getConstexprKind(), + TrailingRequiresClause)) return ToFunction; } diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 6cfd4c2a2a218..e760fff2ff6ca 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -1839,21 +1839,25 @@ void DeclaratorDecl::setQualifierInfo(NestedNameSpecifierLoc QualifierLoc) { } // Set qualifier info. getExtInfo()->QualifierLoc = QualifierLoc; - } else { + } else if (hasExtInfo()) { // Here Qualifier == 0, i.e., we are removing the qualifier (if any). - if (hasExtInfo()) { - if (getExtInfo()->NumTemplParamLists == 0) { - // Save type source info pointer. - TypeSourceInfo *savedTInfo = getExtInfo()->TInfo; - // Deallocate the extended decl info. - getASTContext().Deallocate(getExtInfo()); - // Restore savedTInfo into (non-extended) decl info. - DeclInfo = savedTInfo; - } - else - getExtInfo()->QualifierLoc = QualifierLoc; - } + getExtInfo()->QualifierLoc = QualifierLoc; + } +} + +void DeclaratorDecl::setTrailingRequiresClause(Expr *TrailingRequiresClause) { + assert(TrailingRequiresClause); + // Make sure the extended decl info is allocated. + if (!hasExtInfo()) { + // Save (non-extended) type source info pointer. + auto *savedTInfo = DeclInfo.get(); + // Allocate external info struct. + DeclInfo = new (getASTContext()) ExtInfo; + // Restore savedTInfo into (extended) decl info. + getExtInfo()->TInfo = savedTInfo; } + // Set requires clause info. 
+ getExtInfo()->TrailingRequiresClause = TrailingRequiresClause; } void DeclaratorDecl::setTemplateParameterListsInfo( @@ -2777,7 +2781,8 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, StorageClass S, bool isInlineSpecified, - ConstexprSpecKind ConstexprKind) + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) : DeclaratorDecl(DK, DC, NameInfo.getLoc(), NameInfo.getName(), T, TInfo, StartLoc), DeclContext(DK), redeclarable_base(C), Body(), ODRHash(0), @@ -2807,6 +2812,8 @@ FunctionDecl::FunctionDecl(Kind DK, ASTContext &C, DeclContext *DC, FunctionDeclBits.IsMultiVersion = false; FunctionDeclBits.IsCopyDeductionCandidate = false; FunctionDeclBits.HasODRHash = false; + if (TrailingRequiresClause) + setTrailingRequiresClause(TrailingRequiresClause); } void FunctionDecl::getNameForDiagnostic( @@ -4683,10 +4690,12 @@ FunctionDecl *FunctionDecl::Create(ASTContext &C, DeclContext *DC, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInlineSpecified, bool hasWrittenPrototype, - ConstexprSpecKind ConstexprKind) { + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { FunctionDecl *New = new (C, DC) FunctionDecl(Function, C, DC, StartLoc, NameInfo, T, TInfo, - SC, isInlineSpecified, ConstexprKind); + SC, isInlineSpecified, ConstexprKind, + TrailingRequiresClause); New->setHasWrittenPrototype(hasWrittenPrototype); return New; } @@ -4694,7 +4703,7 @@ FunctionDecl *FunctionDecl::Create(ASTContext &C, DeclContext *DC, FunctionDecl *FunctionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) FunctionDecl(Function, C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, - SC_None, false, CSK_unspecified); + SC_None, false, CSK_unspecified, nullptr); } BlockDecl *BlockDecl::Create(ASTContext &C, DeclContext *DC, SourceLocation L) { diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index caa60408b5b67..bc75c4e544d28 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -2041,16 +2041,19 @@ CXXMethodDecl *CXXMethodDecl::Create(ASTContext &C, CXXRecordDecl *RD, QualType T, TypeSourceInfo *TInfo, StorageClass SC, bool isInline, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) { + SourceLocation EndLocation, + Expr *TrailingRequiresClause) { return new (C, RD) CXXMethodDecl(CXXMethod, C, RD, StartLoc, NameInfo, T, TInfo, SC, - isInline, ConstexprKind, EndLocation); + isInline, ConstexprKind, EndLocation, + TrailingRequiresClause); } CXXMethodDecl *CXXMethodDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXMethodDecl( CXXMethod, C, nullptr, SourceLocation(), DeclarationNameInfo(), - QualType(), nullptr, SC_None, false, CSK_unspecified, SourceLocation()); + QualType(), nullptr, SC_None, false, CSK_unspecified, SourceLocation(), + nullptr); } CXXMethodDecl *CXXMethodDecl::getDevirtualizedMethod(const Expr *Base, @@ -2431,9 +2434,11 @@ CXXConstructorDecl::CXXConstructorDecl( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited) + ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited, + Expr *TrailingRequiresClause) : CXXMethodDecl(CXXConstructor, C, RD, StartLoc, NameInfo, T, TInfo, - SC_None, isInline, ConstexprKind, SourceLocation()) { + SC_None, 
isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause) { setNumCtorInitializers(0); setInheritingConstructor(static_cast(Inherited)); setImplicit(isImplicitlyDeclared); @@ -2457,7 +2462,7 @@ CXXConstructorDecl *CXXConstructorDecl::CreateDeserialized(ASTContext &C, auto *Result = new (C, ID, Extra) CXXConstructorDecl(C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, ExplicitSpecifier(), false, false, - CSK_unspecified, InheritedConstructor()); + CSK_unspecified, InheritedConstructor(), nullptr); Result->setInheritingConstructor(isInheritingConstructor); Result->CXXConstructorDeclBits.HasTrailingExplicitSpecifier = hasTraillingExplicit; @@ -2469,7 +2474,8 @@ CXXConstructorDecl *CXXConstructorDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, ExplicitSpecifier ES, bool isInline, bool isImplicitlyDeclared, - ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited) { + ConstexprSpecKind ConstexprKind, InheritedConstructor Inherited, + Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConstructorName && "Name must refer to a constructor"); @@ -2478,7 +2484,8 @@ CXXConstructorDecl *CXXConstructorDecl::Create( Inherited ? 1 : 0, ES.getExpr() ? 1 : 0); return new (C, RD, Extra) CXXConstructorDecl(C, RD, StartLoc, NameInfo, T, TInfo, ES, isInline, - isImplicitlyDeclared, ConstexprKind, Inherited); + isImplicitlyDeclared, ConstexprKind, Inherited, + TrailingRequiresClause); } CXXConstructorDecl::init_const_iterator CXXConstructorDecl::init_begin() const { @@ -2599,19 +2606,22 @@ CXXDestructorDecl * CXXDestructorDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXDestructorDecl(C, nullptr, SourceLocation(), DeclarationNameInfo(), - QualType(), nullptr, false, false, CSK_unspecified); + QualType(), nullptr, false, false, CSK_unspecified, + nullptr); } CXXDestructorDecl *CXXDestructorDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, - bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind) { + bool isInline, bool isImplicitlyDeclared, ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXDestructorName && "Name must refer to a destructor"); return new (C, RD) CXXDestructorDecl(C, RD, StartLoc, NameInfo, T, TInfo, isInline, - isImplicitlyDeclared, ConstexprKind); + isImplicitlyDeclared, ConstexprKind, + TrailingRequiresClause); } void CXXDestructorDecl::setOperatorDelete(FunctionDecl *OD, Expr *ThisArg) { @@ -2630,20 +2640,20 @@ CXXConversionDecl * CXXConversionDecl::CreateDeserialized(ASTContext &C, unsigned ID) { return new (C, ID) CXXConversionDecl( C, nullptr, SourceLocation(), DeclarationNameInfo(), QualType(), nullptr, - false, ExplicitSpecifier(), CSK_unspecified, SourceLocation()); + false, ExplicitSpecifier(), CSK_unspecified, SourceLocation(), nullptr); } CXXConversionDecl *CXXConversionDecl::Create( ASTContext &C, CXXRecordDecl *RD, SourceLocation StartLoc, const DeclarationNameInfo &NameInfo, QualType T, TypeSourceInfo *TInfo, bool isInline, ExplicitSpecifier ES, ConstexprSpecKind ConstexprKind, - SourceLocation EndLocation) { + SourceLocation EndLocation, Expr *TrailingRequiresClause) { assert(NameInfo.getName().getNameKind() == DeclarationName::CXXConversionFunctionName && "Name must refer to a 
conversion function"); return new (C, RD) CXXConversionDecl(C, RD, StartLoc, NameInfo, T, TInfo, isInline, ES, - ConstexprKind, EndLocation); + ConstexprKind, EndLocation, TrailingRequiresClause); } bool CXXConversionDecl::isLambdaToBlockPointerConversion() const { diff --git a/clang/lib/AST/DeclPrinter.cpp b/clang/lib/AST/DeclPrinter.cpp index 6b17dd4dde759..68cc8b7a0eb9c 100644 --- a/clang/lib/AST/DeclPrinter.cpp +++ b/clang/lib/AST/DeclPrinter.cpp @@ -740,6 +740,11 @@ void DeclPrinter::VisitFunctionDecl(FunctionDecl *D) { Proto.clear(); } Out << Proto; + + if (Expr *TrailingRequiresClause = D->getTrailingRequiresClause()) { + Out << " requires "; + TrailingRequiresClause->printPretty(Out, nullptr, SubPolicy, Indentation); + } } else { Ty.print(Out, Policy, Proto); } diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index 23734396b7694..59fa7faad927d 100755 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -171,13 +171,18 @@ void TemplateDecl::anchor() {} void TemplateDecl:: getAssociatedConstraints(llvm::SmallVectorImpl &AC) const { - // TODO: Concepts: Append function trailing requires clause. TemplateParams->getAssociatedConstraints(AC); + if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) + if (const Expr *TRC = FD->getTrailingRequiresClause()) + AC.push_back(TRC); } bool TemplateDecl::hasAssociatedConstraints() const { - // TODO: Concepts: Regard function trailing requires clause. - return TemplateParams->hasAssociatedConstraints(); + if (TemplateParams->hasAssociatedConstraints()) + return true; + if (auto *FD = dyn_cast_or_null(getTemplatedDecl())) + return FD->getTrailingRequiresClause(); + return false; } //===----------------------------------------------------------------------===// diff --git a/clang/lib/Parse/ParseDecl.cpp b/clang/lib/Parse/ParseDecl.cpp index 514a2ede936e5..c94ccee602879 100644 --- a/clang/lib/Parse/ParseDecl.cpp +++ b/clang/lib/Parse/ParseDecl.cpp @@ -2014,6 +2014,9 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, return nullptr; } + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); + // Save late-parsed attributes for now; they need to be parsed in the // appropriate function scope after the function Decl has been constructed. // These will be parsed in ParseFunctionDefinition or ParseLexedAttrList. 
@@ -2165,6 +2168,12 @@ Parser::DeclGroupPtrTy Parser::ParseDeclGroup(ParsingDeclSpec &DS, ParseDeclarator(D); if (!D.isInvalidType()) { + // C++2a [dcl.decl]p1 + // init-declarator: + // declarator initializer[opt] + // declarator requires-clause + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); Decl *ThisDecl = ParseDeclarationAfterDeclarator(D); D.complete(ThisDecl); if (ThisDecl) @@ -6032,6 +6041,22 @@ void Parser::ParseDirectDeclarator(Declarator &D) { PrototypeScope.Exit(); } else if (Tok.is(tok::l_square)) { ParseBracketDeclarator(D); + } else if (Tok.is(tok::kw_requires) && D.hasGroupingParens()) { + // This declarator is declaring a function, but the requires clause is + // in the wrong place: + // void (f() requires true); + // instead of + // void f() requires true; + // or + // void (f()) requires true; + Diag(Tok, diag::err_requires_clause_inside_parens); + ConsumeToken(); + ExprResult TrailingRequiresClause = Actions.CorrectDelayedTyposInExpr( + ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true)); + if (TrailingRequiresClause.isUsable() && D.isFunctionDeclarator() && + !D.hasTrailingRequiresClause()) + // We're already ill-formed if we got here but we'll accept it anyway. + D.setTrailingRequiresClause(TrailingRequiresClause.get()); } else { break; } @@ -6212,6 +6237,47 @@ void Parser::ParseParenDeclarator(Declarator &D) { PrototypeScope.Exit(); } +void Parser::InitCXXThisScopeForDeclaratorIfRelevant( + const Declarator &D, const DeclSpec &DS, + llvm::Optional &ThisScope) { + // C++11 [expr.prim.general]p3: + // If a declaration declares a member function or member function + // template of a class X, the expression this is a prvalue of type + // "pointer to cv-qualifier-seq X" between the optional cv-qualifer-seq + // and the end of the function-definition, member-declarator, or + // declarator. + // FIXME: currently, "static" case isn't handled correctly. + bool IsCXX11MemberFunction = getLangOpts().CPlusPlus11 && + D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef && + (D.getContext() == DeclaratorContext::MemberContext + ? !D.getDeclSpec().isFriendSpecified() + : D.getContext() == DeclaratorContext::FileContext && + D.getCXXScopeSpec().isValid() && + Actions.CurContext->isRecord()); + if (!IsCXX11MemberFunction) + return; + + Qualifiers Q = Qualifiers::fromCVRUMask(DS.getTypeQualifiers()); + if (D.getDeclSpec().hasConstexprSpecifier() && !getLangOpts().CPlusPlus14) + Q.addConst(); + // FIXME: Collect C++ address spaces. + // If there are multiple different address spaces, the source is invalid. + // Carry on using the first addr space for the qualifiers of 'this'. + // The diagnostic will be given later while creating the function + // prototype for the method. + if (getLangOpts().OpenCLCPlusPlus) { + for (ParsedAttr &attr : DS.getAttributes()) { + LangAS ASIdx = attr.asOpenCLLangAS(); + if (ASIdx != LangAS::Default) { + Q.addAddressSpace(ASIdx); + break; + } + } + } + ThisScope.emplace(Actions, dyn_cast(Actions.CurContext), Q, + IsCXX11MemberFunction); +} + /// ParseFunctionDeclarator - We are after the identifier and have parsed the /// declarator D up to a paren, which indicates that we are parsing function /// arguments. @@ -6225,7 +6291,8 @@ void Parser::ParseParenDeclarator(Declarator &D) { /// /// For C++, after the parameter-list, it also parses the cv-qualifier-seq[opt], /// (C++11) ref-qualifier[opt], exception-specification[opt], -/// (C++11) attribute-specifier-seq[opt], and (C++11) trailing-return-type[opt]. 
+/// (C++11) attribute-specifier-seq[opt], (C++11) trailing-return-type[opt] and +/// (C++2a) the trailing requires-clause. /// /// [C++11] exception-specification: /// dynamic-exception-specification @@ -6320,43 +6387,8 @@ void Parser::ParseFunctionDeclarator(Declarator &D, if (ParseRefQualifier(RefQualifierIsLValueRef, RefQualifierLoc)) EndLoc = RefQualifierLoc; - // C++11 [expr.prim.general]p3: - // If a declaration declares a member function or member function - // template of a class X, the expression this is a prvalue of type - // "pointer to cv-qualifier-seq X" between the optional cv-qualifer-seq - // and the end of the function-definition, member-declarator, or - // declarator. - // FIXME: currently, "static" case isn't handled correctly. - bool IsCXX11MemberFunction = - getLangOpts().CPlusPlus11 && - D.getDeclSpec().getStorageClassSpec() != DeclSpec::SCS_typedef && - (D.getContext() == DeclaratorContext::MemberContext - ? !D.getDeclSpec().isFriendSpecified() - : D.getContext() == DeclaratorContext::FileContext && - D.getCXXScopeSpec().isValid() && - Actions.CurContext->isRecord()); - - Qualifiers Q = Qualifiers::fromCVRUMask(DS.getTypeQualifiers()); - if (D.getDeclSpec().hasConstexprSpecifier() && !getLangOpts().CPlusPlus14) - Q.addConst(); - // FIXME: Collect C++ address spaces. - // If there are multiple different address spaces, the source is invalid. - // Carry on using the first addr space for the qualifiers of 'this'. - // The diagnostic will be given later while creating the function - // prototype for the method. - if (getLangOpts().OpenCLCPlusPlus) { - for (ParsedAttr &attr : DS.getAttributes()) { - LangAS ASIdx = attr.asOpenCLLangAS(); - if (ASIdx != LangAS::Default) { - Q.addAddressSpace(ASIdx); - break; - } - } - } - - Sema::CXXThisScopeRAII ThisScope( - Actions, dyn_cast(Actions.CurContext), Q, - IsCXX11MemberFunction); + llvm::Optional ThisScope; + InitCXXThisScopeForDeclaratorIfRelevant(D, DS, ThisScope); // Parse exception-specification[opt]. bool Delayed = D.isFirstDeclarationOfMember() && @@ -6624,6 +6656,17 @@ void Parser::ParseParameterDeclarationClause( // Parse GNU attributes, if present. MaybeParseGNUAttributes(ParmDeclarator); + if (Tok.is(tok::kw_requires)) { + // User tried to define a requires clause in a parameter declaration, + // which is surely not a function declaration. + // void f(int (*g)(int, int) requires true); + Diag(Tok, + diag::err_requires_clause_on_declarator_not_declaring_a_function); + ConsumeToken(); + Actions.CorrectDelayedTyposInExpr( + ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true)); + } + // Remember this parsed parameter in ParamInfo. 
IdentifierInfo *ParmII = ParmDeclarator.getIdentifier(); diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index af3403403c11b..081d4d8b12092 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2301,6 +2301,7 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( LateParsedAttrList &LateParsedAttrs) { // member-declarator: // declarator pure-specifier[opt] + // declarator requires-clause // declarator brace-or-equal-initializer[opt] // identifier[opt] ':' constant-expression if (Tok.isNot(tok::colon)) @@ -2314,6 +2315,8 @@ bool Parser::ParseCXXMemberDeclaratorBeforeInitializer( BitfieldSize = ParseConstantExpression(); if (BitfieldSize.isInvalid()) SkipUntil(tok::comma, StopAtSemi | StopBeforeMatch); + } else if (Tok.is(tok::kw_requires)) { + ParseTrailingRequiresClause(DeclaratorInfo); } else { ParseOptionalCXX11VirtSpecifierSeq( VS, getCurrentClass().IsInterface, @@ -2436,6 +2439,7 @@ void Parser::MaybeParseAndDiagnoseDeclSpecAfterCXX11VirtSpecifierSeq( /// /// member-declarator: /// declarator virt-specifier-seq[opt] pure-specifier[opt] +/// [C++2a] declarator requires-clause /// declarator constant-initializer[opt] /// [C++11] declarator brace-or-equal-initializer[opt] /// identifier[opt] ':' constant-expression @@ -2669,6 +2673,7 @@ Parser::ParseCXXClassMemberDeclaration(AccessSpecifier AS, SmallVector DeclsInGroup; ExprResult BitfieldSize; + ExprResult TrailingRequiresClause; bool ExpectSemi = true; // Parse the first declarator. @@ -3793,6 +3798,62 @@ TypeResult Parser::ParseTrailingReturnType(SourceRange &Range, : DeclaratorContext::TrailingReturnContext); } +/// Parse a requires-clause as part of a function declaration. +void Parser::ParseTrailingRequiresClause(Declarator &D) { + assert(Tok.is(tok::kw_requires) && "expected requires"); + + SourceLocation RequiresKWLoc = ConsumeToken(); + + ExprResult TrailingRequiresClause; + ParseScope ParamScope(this, + Scope::DeclScope | + Scope::FunctionDeclarationScope | + Scope::FunctionPrototypeScope); + + Actions.ActOnStartTrailingRequiresClause(getCurScope(), D); + + llvm::Optional ThisScope; + InitCXXThisScopeForDeclaratorIfRelevant(D, D.getDeclSpec(), ThisScope); + + TrailingRequiresClause = + ParseConstraintLogicalOrExpression(/*IsTrailingRequiresClause=*/true); + + TrailingRequiresClause = + Actions.ActOnFinishTrailingRequiresClause(TrailingRequiresClause); + + if (!D.isDeclarationOfFunction()) { + Diag(RequiresKWLoc, + diag::err_requires_clause_on_declarator_not_declaring_a_function); + return; + } + + if (TrailingRequiresClause.isInvalid()) + SkipUntil({tok::l_brace, tok::arrow, tok::kw_try, tok::comma, tok::colon}, + StopAtSemi | StopBeforeMatch); + else + D.setTrailingRequiresClause(TrailingRequiresClause.get()); + + // Did the user swap the trailing return type and requires clause? 
+ if (D.isFunctionDeclarator() && Tok.is(tok::arrow) && + D.getDeclSpec().getTypeSpecType() == TST_auto) { + SourceLocation ArrowLoc = Tok.getLocation(); + SourceRange Range; + TypeResult TrailingReturnType = + ParseTrailingReturnType(Range, /*MayBeFollowedByDirectInit=*/false); + + if (!TrailingReturnType.isInvalid()) { + Diag(ArrowLoc, + diag::err_requires_clause_must_appear_after_trailing_return) + << Range; + auto &FunctionChunk = D.getFunctionTypeInfo(); + FunctionChunk.HasTrailingReturnType = TrailingReturnType.isUsable(); + FunctionChunk.TrailingReturnType = TrailingReturnType.get(); + } else + SkipUntil({tok::equal, tok::l_brace, tok::arrow, tok::kw_try, tok::comma}, + StopAtSemi | StopBeforeMatch); + } +} + /// We have just started parsing the definition of a new class, /// so push that class onto our stack of classes that is currently /// being parsed. diff --git a/clang/lib/Parse/ParseExpr.cpp b/clang/lib/Parse/ParseExpr.cpp index b74a95a3cd4b6..067a77a592f2f 100644 --- a/clang/lib/Parse/ParseExpr.cpp +++ b/clang/lib/Parse/ParseExpr.cpp @@ -22,6 +22,7 @@ #include "clang/Parse/Parser.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ExprCXX.h" #include "clang/Basic/PrettyStackTrace.h" #include "clang/Parse/RAIIObjectsForParser.h" #include "clang/Sema/DeclSpec.h" @@ -145,7 +146,7 @@ Parser::ParseExpressionWithLeadingExtension(SourceLocation ExtLoc) { // Silence extension warnings in the sub-expression ExtensionRAIIObject O(Diags); - LHS = ParseCastExpression(false); + LHS = ParseCastExpression(AnyCastExpr); } if (!LHS.isInvalid()) @@ -169,7 +170,7 @@ ExprResult Parser::ParseAssignmentExpression(TypeCastState isTypeCast) { if (Tok.is(tok::kw_co_yield)) return ParseCoyieldExpression(); - ExprResult LHS = ParseCastExpression(/*isUnaryExpression=*/false, + ExprResult LHS = ParseCastExpression(AnyCastExpr, /*isAddressOfOperand=*/false, isTypeCast); return ParseRHSOfBinaryExpression(LHS, prec::Assignment); @@ -202,7 +203,7 @@ Parser::ParseConstantExpressionInExprEvalContext(TypeCastState isTypeCast) { Sema::ExpressionEvaluationContext::ConstantEvaluated && "Call this function only if your ExpressionEvaluationContext is " "already ConstantEvaluated"); - ExprResult LHS(ParseCastExpression(false, false, isTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, isTypeCast)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); return Actions.ActOnConstantExpression(Res); } @@ -220,7 +221,7 @@ ExprResult Parser::ParseConstantExpression(TypeCastState isTypeCast) { ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) { EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - ExprResult LHS(ParseCastExpression(false, false, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, NotTypeCast)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); return Actions.ActOnCaseExpr(CaseLoc, Res); } @@ -234,13 +235,143 @@ ExprResult Parser::ParseCaseExpression(SourceLocation CaseLoc) { ExprResult Parser::ParseConstraintExpression() { EnterExpressionEvaluationContext ConstantEvaluated( Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); - ExprResult LHS(ParseCastExpression(/*isUnaryExpression=*/false)); + ExprResult LHS(ParseCastExpression(AnyCastExpr)); ExprResult Res(ParseRHSOfBinaryExpression(LHS, prec::LogicalOr)); - if (Res.isUsable() && !Actions.CheckConstraintExpression(Res.get())) + if (Res.isUsable() && 
!Actions.CheckConstraintExpression(Res.get())) { + Actions.CorrectDelayedTyposInExpr(Res); return ExprError(); + } return Res; } +/// \brief Parse a constraint-logical-and-expression. +/// +/// \param RightMostExpr If provided, will receive the right-most atomic +/// constraint that was parsed. +/// \verbatim +/// C++2a[temp.constr.decl]p1 +/// constraint-logical-and-expression: +/// primary-expression +/// constraint-logical-and-expression '&&' primary-expression +/// +/// \endverbatim +ExprResult +Parser::ParseConstraintLogicalAndExpression(bool IsTrailingRequiresClause) { + EnterExpressionEvaluationContext ConstantEvaluated( + Actions, Sema::ExpressionEvaluationContext::ConstantEvaluated); + bool NotPrimaryExpression = false; + auto ParsePrimary = [&] () { + ExprResult E = ParseCastExpression(PrimaryExprOnly, + /*isAddressOfOperand=*/false, + /*isTypeCast=*/NotTypeCast, + /*isVectorLiteral=*/false, + &NotPrimaryExpression); + if (E.isInvalid()) + return ExprError(); + auto RecoverFromNonPrimary = [&] (ExprResult E, bool Note) { + E = ParsePostfixExpressionSuffix(E); + // Use InclusiveOr, the precedence just after '&&' to not parse the + // next arguments to the logical and. + E = ParseRHSOfBinaryExpression(E, prec::InclusiveOr); + if (!E.isInvalid()) + Diag(E.get()->getExprLoc(), + Note + ? diag::note_unparenthesized_non_primary_expr_in_requires_clause + : diag::err_unparenthesized_non_primary_expr_in_requires_clause) + << FixItHint::CreateInsertion(E.get()->getBeginLoc(), "(") + << FixItHint::CreateInsertion( + PP.getLocForEndOfToken(E.get()->getEndLoc()), ")") + << E.get()->getSourceRange(); + return E; + }; + + if (NotPrimaryExpression || + // Check if the following tokens must be a part of a non-primary + // expression + getBinOpPrecedence(Tok.getKind(), GreaterThanIsOperator, + /*CPlusPlus11=*/true) > prec::LogicalAnd || + // Postfix operators other than '(' (which will be checked for in + // CheckConstraintExpression). + Tok.isOneOf(tok::period, tok::plusplus, tok::minusminus) || + (Tok.is(tok::l_square) && !NextToken().is(tok::l_square))) { + E = RecoverFromNonPrimary(E, /*Note=*/false); + if (E.isInvalid()) + return ExprError(); + NotPrimaryExpression = false; + } + bool PossibleNonPrimary; + bool IsConstraintExpr = + Actions.CheckConstraintExpression(E.get(), Tok, &PossibleNonPrimary, + IsTrailingRequiresClause); + if (!IsConstraintExpr || PossibleNonPrimary) { + // Atomic constraint might be an unparenthesized non-primary expression + // (such as a binary operator), in which case we might get here (e.g. in + // 'requires 0 + 1 && true' we would now be at '+', and parse and ignore + // the rest of the addition expression). Try to parse the rest of it here. + if (PossibleNonPrimary) + E = RecoverFromNonPrimary(E, /*Note=*/!IsConstraintExpr); + Actions.CorrectDelayedTyposInExpr(E); + return ExprError(); + } + return E; + }; + ExprResult LHS = ParsePrimary(); + if (LHS.isInvalid()) + return ExprError(); + while (Tok.is(tok::ampamp)) { + SourceLocation LogicalAndLoc = ConsumeToken(); + ExprResult RHS = ParsePrimary(); + if (RHS.isInvalid()) { + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalAndLoc, + tok::ampamp, LHS.get(), RHS.get()); + if (!Op.isUsable()) { + Actions.CorrectDelayedTyposInExpr(RHS); + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + LHS = Op; + } + return LHS; +} + +/// \brief Parse a constraint-logical-or-expression. 
+/// +/// \verbatim +/// C++2a[temp.constr.decl]p1 +/// constraint-logical-or-expression: +/// constraint-logical-and-expression +/// constraint-logical-or-expression '||' +/// constraint-logical-and-expression +/// +/// \endverbatim +ExprResult +Parser::ParseConstraintLogicalOrExpression(bool IsTrailingRequiresClause) { + ExprResult LHS(ParseConstraintLogicalAndExpression(IsTrailingRequiresClause)); + if (!LHS.isUsable()) + return ExprError(); + while (Tok.is(tok::pipepipe)) { + SourceLocation LogicalOrLoc = ConsumeToken(); + ExprResult RHS = + ParseConstraintLogicalAndExpression(IsTrailingRequiresClause); + if (!RHS.isUsable()) { + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + ExprResult Op = Actions.ActOnBinOp(getCurScope(), LogicalOrLoc, + tok::pipepipe, LHS.get(), RHS.get()); + if (!Op.isUsable()) { + Actions.CorrectDelayedTyposInExpr(RHS); + Actions.CorrectDelayedTyposInExpr(LHS); + return ExprError(); + } + LHS = Op; + } + return LHS; +} + bool Parser::isNotExpressionStart() { tok::TokenKind K = Tok.getKind(); if (K == tok::l_brace || K == tok::r_brace || @@ -414,7 +545,7 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { } else if (getLangOpts().CPlusPlus && NextTokPrec <= prec::Conditional) RHS = ParseAssignmentExpression(); else - RHS = ParseCastExpression(false); + RHS = ParseCastExpression(AnyCastExpr); if (RHS.isInvalid()) { // FIXME: Errors generated by the delayed typo correction should be @@ -519,22 +650,24 @@ Parser::ParseRHSOfBinaryExpression(ExprResult LHS, prec::Level MinPrec) { } } -/// Parse a cast-expression, or, if \p isUnaryExpression is true, -/// parse a unary-expression. +/// Parse a cast-expression, unary-expression or primary-expression, based +/// on \p ExprType. /// /// \p isAddressOfOperand exists because an id-expression that is the /// operand of address-of gets special treatment due to member pointers. /// -ExprResult Parser::ParseCastExpression(bool isUnaryExpression, +ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, TypeCastState isTypeCast, - bool isVectorLiteral) { + bool isVectorLiteral, + bool *NotPrimaryExpression) { bool NotCastExpr; - ExprResult Res = ParseCastExpression(isUnaryExpression, + ExprResult Res = ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, isTypeCast, - isVectorLiteral); + isVectorLiteral, + NotPrimaryExpression); if (NotCastExpr) Diag(Tok, diag::err_expected_expression); return Res; @@ -759,11 +892,12 @@ class CastExpressionIdValidator final : public CorrectionCandidateCallback { /// '__is_rvalue_expr' /// \endverbatim /// -ExprResult Parser::ParseCastExpression(bool isUnaryExpression, +ExprResult Parser::ParseCastExpression(CastParseKind ParseKind, bool isAddressOfOperand, bool &NotCastExpr, TypeCastState isTypeCast, - bool isVectorLiteral) { + bool isVectorLiteral, + bool *NotPrimaryExpression) { ExprResult Res; tok::TokenKind SavedKind = Tok.getKind(); auto SavedType = PreferredType; @@ -782,11 +916,21 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // ParsePostfixExpressionSuffix. switch (SavedKind) { case tok::l_paren: { - // If this expression is limited to being a unary-expression, the parent can + // If this expression is limited to being a unary-expression, the paren can // not start a cast expression. - ParenParseOption ParenExprType = - (isUnaryExpression && !getLangOpts().CPlusPlus) ? 
CompoundLiteral - : CastExpr; + ParenParseOption ParenExprType; + switch (ParseKind) { + case CastParseKind::UnaryExprOnly: + if (!getLangOpts().CPlusPlus) + ParenExprType = CompoundLiteral; + LLVM_FALLTHROUGH; + case CastParseKind::AnyCastExpr: + ParenExprType = ParenParseOption::CastExpr; + break; + case CastParseKind::PrimaryExprOnly: + ParenExprType = FoldExpr; + break; + } ParsedType CastTy; SourceLocation RParenLoc; Res = ParseParenExpression(ParenExprType, false/*stopIfCastExr*/, @@ -861,8 +1005,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); assert(Tok.isNot(tok::kw_decltype) && Tok.isNot(tok::kw___super)); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); - + return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, + isVectorLiteral, NotPrimaryExpression); + case tok::identifier: { // primary-expression: identifier // unqualified-id: identifier // constant: enumeration-constant @@ -949,8 +1094,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, = RevertibleTypeTraits.find(II); if (Known != RevertibleTypeTraits.end()) { Tok.setKind(Known->second); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + isVectorLiteral, NotPrimaryExpression); } } @@ -961,7 +1107,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::identifier)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + isVectorLiteral, + NotPrimaryExpression); } } @@ -1076,8 +1225,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, Tok.is(tok::r_paren) ? nullptr : &Replacement); if (!Res.isInvalid() && Res.isUnset()) { UnconsumeToken(Replacement); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, + /*isVectorLiteral=*/false, + NotPrimaryExpression); } if (!Res.isInvalid() && Tok.is(tok::less)) checkPotentialAngleBracket(Res); @@ -1122,12 +1273,16 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___builtin_FILE: case tok::kw___builtin_FUNCTION: case tok::kw___builtin_LINE: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseBuiltinPrimaryExpression(); case tok::kw___null: return Actions.ActOnGNUNullExpr(ConsumeToken()); case tok::plusplus: // unary-expression: '++' unary-expression [C99] case tok::minusminus: { // unary-expression: '--' unary-expression [C99] + if (NotPrimaryExpression) + *NotPrimaryExpression = true; // C++ [expr.unary] has: // unary-expression: // ++ cast-expression @@ -1140,7 +1295,8 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // One special case is implicitly handled here: if the preceding tokens are // an ambiguous cast expression, such as "(T())++", then we recurse to // determine whether the '++' is prefix or postfix. - Res = ParseCastExpression(!getLangOpts().CPlusPlus, + Res = ParseCastExpression(getLangOpts().CPlusPlus ? 
+ UnaryExprOnly : AnyCastExpr, /*isAddressOfOperand*/false, NotCastExpr, NotTypeCast); if (NotCastExpr) { @@ -1156,10 +1312,12 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, return Res; } case tok::amp: { // unary-expression: '&' cast-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; // Special treatment because of member pointers SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), tok::amp, SavedLoc); - Res = ParseCastExpression(false, true); + Res = ParseCastExpression(AnyCastExpr, true); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; @@ -1172,17 +1330,21 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::exclaim: // unary-expression: '!' cast-expression case tok::kw___real: // unary-expression: '__real' cast-expression [GNU] case tok::kw___imag: { // unary-expression: '__imag' cast-expression [GNU] + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation SavedLoc = ConsumeToken(); PreferredType.enterUnary(Actions, Tok.getLocation(), SavedKind, SavedLoc); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; } case tok::kw_co_await: { // unary-expression: 'co_await' cast-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation CoawaitLoc = ConsumeToken(); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnCoawaitExpr(getCurScope(), CoawaitLoc, Res.get()); return Res; @@ -1190,9 +1352,11 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___extension__:{//unary-expression:'__extension__' cast-expr [GNU] // __extension__ silences extension warnings in the subexpression. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; ExtensionRAIIObject O(Diags); // Use RAII to do this. 
SourceLocation SavedLoc = ConsumeToken(); - Res = ParseCastExpression(false); + Res = ParseCastExpression(AnyCastExpr); if (!Res.isInvalid()) Res = Actions.ActOnUnaryOp(getCurScope(), SavedLoc, SavedKind, Res.get()); return Res; @@ -1209,8 +1373,12 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw_vec_step: // unary-expression: OpenCL 'vec_step' expression // unary-expression: '__builtin_omp_required_simd_align' '(' type-name ')' case tok::kw___builtin_omp_required_simd_align: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseUnaryExprOrTypeTraitExpression(); case tok::ampamp: { // unary-expression: '&&' identifier + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation AmpAmpLoc = ConsumeToken(); if (Tok.isNot(tok::identifier)) return ExprError(Diag(Tok, diag::err_expected) << tok::identifier); @@ -1229,15 +1397,23 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw_dynamic_cast: case tok::kw_reinterpret_cast: case tok::kw_static_cast: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXCasts(); break; case tok::kw___builtin_bit_cast: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseBuiltinBitCast(); break; case tok::kw_typeid: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXTypeid(); break; case tok::kw___uuidof: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseCXXUuidof(); break; case tok::kw_this: @@ -1302,6 +1478,10 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, return ExprError(); } + // Everything henceforth is a postfix-expression. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; + if (SavedKind == tok::kw_typename) { // postfix-expression: typename-specifier '(' expression-list[opt] ')' // typename-specifier braced-init-list @@ -1338,8 +1518,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::annot_cxxscope)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, + isTypeCast, isVectorLiteral, + NotPrimaryExpression); Token Next = NextToken(); if (Next.is(tok::annot_template_id)) { @@ -1352,8 +1533,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, ParseOptionalCXXScopeSpecifier(SS, nullptr, /*EnteringContext=*/false); AnnotateTemplateIdTokenAsType(); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, NotCastExpr, + isTypeCast, isVectorLiteral, + NotPrimaryExpression); } } @@ -1369,8 +1551,9 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // translate it into a type and continue parsing as a cast // expression. AnnotateTemplateIdTokenAsType(); - return ParseCastExpression(isUnaryExpression, isAddressOfOperand, - NotCastExpr, isTypeCast); + return ParseCastExpression(ParseKind, isAddressOfOperand, + NotCastExpr, isTypeCast, isVectorLiteral, + NotPrimaryExpression); } // Fall through to treat the template-id as an id-expression. 
@@ -1387,15 +1570,22 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, if (TryAnnotateTypeOrScopeToken()) return ExprError(); if (!Tok.is(tok::coloncolon)) - return ParseCastExpression(isUnaryExpression, isAddressOfOperand); + return ParseCastExpression(ParseKind, isAddressOfOperand, isTypeCast, + isVectorLiteral, NotPrimaryExpression); // ::new -> [C++] new-expression // ::delete -> [C++] delete-expression SourceLocation CCLoc = ConsumeToken(); - if (Tok.is(tok::kw_new)) + if (Tok.is(tok::kw_new)) { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXNewExpression(true, CCLoc); - if (Tok.is(tok::kw_delete)) + } + if (Tok.is(tok::kw_delete)) { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXDeleteExpression(true, CCLoc); + } // This is not a type name or scope specifier, it is an invalid expression. Diag(CCLoc, diag::err_expected_expression); @@ -1403,12 +1593,18 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, } case tok::kw_new: // [C++] new-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXNewExpression(false, Tok.getLocation()); case tok::kw_delete: // [C++] delete-expression + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseCXXDeleteExpression(false, Tok.getLocation()); case tok::kw_noexcept: { // [C++0x] 'noexcept' '(' expression ')' + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Diag(Tok, diag::warn_cxx98_compat_noexcept_expr); SourceLocation KeyLoc = ConsumeToken(); BalancedDelimiterTracker T(*this, tok::l_paren); @@ -1437,13 +1633,19 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, case tok::kw___array_rank: case tok::kw___array_extent: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseArrayTypeTrait(); case tok::kw___is_lvalue_expr: case tok::kw___is_rvalue_expr: + if (NotPrimaryExpression) + *NotPrimaryExpression = true; return ParseExpressionTrait(); case tok::at: { + if (NotPrimaryExpression) + *NotPrimaryExpression = true; SourceLocation AtLoc = ConsumeToken(); return ParseObjCAtExpression(AtLoc); } @@ -1465,8 +1667,13 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // expression, or we have something that doesn't appear to be a lambda. // If we're in the last case, we fall back to ParseObjCMessageExpression. Res = TryParseLambdaExpression(); - if (!Res.isInvalid() && !Res.get()) + if (!Res.isInvalid() && !Res.get()) { + // We assume Objective-C++ message expressions are not + // primary-expressions. + if (NotPrimaryExpression) + *NotPrimaryExpression = true; Res = ParseObjCMessageExpression(); + } break; } Res = ParseLambdaExpression(); @@ -1486,6 +1693,11 @@ ExprResult Parser::ParseCastExpression(bool isUnaryExpression, // are compiling for OpenCL, we need to return an error as this implies // that the address of the function is being taken, which is illegal in CL. + if (ParseKind == PrimaryExprOnly) + // This is strictly a primary-expression - no postfix-expr pieces should be + // parsed. + return Res; + // These can be followed by postfix-expr pieces. 
PreferredType = SavedType; Res = ParsePostfixExpressionSuffix(Res); @@ -1929,7 +2141,7 @@ Parser::ParseExprAfterUnaryExprOrTypeTrait(const Token &OpTok, return ExprError(); } - Operand = ParseCastExpression(true/*isUnaryExpression*/); + Operand = ParseCastExpression(UnaryExprOnly); } else { // If it starts with a '(', we know that it is either a parenthesized // type-name, or it is a unary-expression that starts with a compound @@ -2474,8 +2686,8 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, RParenLoc = T.getCloseLocation(); PreferredType.enterTypeCast(Tok.getLocation(), Ty.get().get()); - ExprResult SubExpr = ParseCastExpression(/*isUnaryExpression=*/false); - + ExprResult SubExpr = ParseCastExpression(AnyCastExpr); + if (Ty.isInvalid() || SubExpr.isInvalid()) return ExprError(); @@ -2555,7 +2767,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, // Parse the cast-expression that follows it next. // isVectorLiteral = true will make sure we don't parse any // Postfix expression yet - Result = ParseCastExpression(/*isUnaryExpression=*/false, + Result = ParseCastExpression(/*isUnaryExpression=*/AnyCastExpr, /*isAddressOfOperand=*/false, /*isTypeCast=*/IsTypeCast, /*isVectorLiteral=*/true); @@ -2607,7 +2819,7 @@ Parser::ParseParenExpression(ParenParseOption &ExprType, bool stopIfCastExpr, PreferredType.enterTypeCast(Tok.getLocation(), CastTy.get()); // Parse the cast-expression that follows it next. // TODO: For cast expression with CastTy. - Result = ParseCastExpression(/*isUnaryExpression=*/false, + Result = ParseCastExpression(/*isUnaryExpression=*/AnyCastExpr, /*isAddressOfOperand=*/false, /*isTypeCast=*/IsTypeCast); if (!Result.isInvalid()) { diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index a39998482e956..f4ffa08b2a1b7 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -1371,10 +1371,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( DeclEndLoc = Range.getEnd(); } - PrototypeScope.Exit(); - - WarnIfHasCUDATargetAttr(); - SourceLocation NoLoc; D.AddTypeInfo(DeclaratorChunk::getFunction( /*HasProto=*/true, @@ -1389,13 +1385,22 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( /*DeclsInPrototype=*/None, LParenLoc, FunLocalRangeEnd, D, TrailingReturnType, &DS), std::move(Attr), DeclEndLoc); + + // Parse requires-clause[opt]. + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); + + PrototypeScope.Exit(); + + WarnIfHasCUDATargetAttr(); } else if (Tok.isOneOf(tok::kw_mutable, tok::arrow, tok::kw___attribute, tok::kw_constexpr, tok::kw_consteval, tok::kw___private, tok::kw___global, tok::kw___local, - tok::kw___constant, tok::kw___generic) || + tok::kw___constant, tok::kw___generic, + tok::kw_requires) || (Tok.is(tok::l_square) && NextToken().is(tok::l_square))) { // It's common to forget that one needs '()' before 'mutable', an attribute - // specifier, or the result type. Deal with this. + // specifier, the result type, or the requires clause. Deal with this. 
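As a sketch of the two lambda situations handled here (Small is a hypothetical concept): a requires-clause may now follow the lambda's parameter list, and a requires-clause written directly after the capture list goes through the existing "missing '()'" recovery.

  template <typename T> concept Small = sizeof(T) <= 8;

  // OK: requires-clause after the parameter-declaration-clause.
  auto ok = []<typename T>(T v) requires Small<T> { return v; };

  // Recovered below: '()' was forgotten before the requires-clause, so the
  // parser suggests inserting the parentheses.
  // auto oops = [] requires true { return 0; };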
unsigned TokKind = 0; switch (Tok.getKind()) { case tok::kw_mutable: TokKind = 0; break; @@ -1409,6 +1414,7 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( case tok::l_square: TokKind = 2; break; case tok::kw_constexpr: TokKind = 3; break; case tok::kw_consteval: TokKind = 4; break; + case tok::kw_requires: TokKind = 5; break; default: llvm_unreachable("Unknown token kind"); } @@ -1440,8 +1446,6 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( DeclEndLoc = Range.getEnd(); } - WarnIfHasCUDATargetAttr(); - SourceLocation NoLoc; D.AddTypeInfo(DeclaratorChunk::getFunction( /*HasProto=*/true, @@ -1462,6 +1466,12 @@ ExprResult Parser::ParseLambdaExpressionAfterIntroducer( /*DeclsInPrototype=*/None, DeclLoc, DeclEndLoc, D, TrailingReturnType), std::move(Attr), DeclEndLoc); + + // Parse the requires-clause, if present. + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(D); + + WarnIfHasCUDATargetAttr(); } // FIXME: Rename BlockScope -> ClosureScope if we decide to continue using @@ -3238,7 +3248,7 @@ Parser::ParseCXXDeleteExpression(bool UseGlobal, SourceLocation Start) { return ExprError(); } - ExprResult Operand(ParseCastExpression(false)); + ExprResult Operand(ParseCastExpression(AnyCastExpr)); if (Operand.isInvalid()) return Operand; @@ -3469,7 +3479,7 @@ Parser::ParseCXXAmbiguousParenExpression(ParenParseOption &ExprType, // If it is not a cast-expression, NotCastExpr will be true and no token // will be consumed. ColonProt.restore(); - Result = ParseCastExpression(false/*isUnaryExpression*/, + Result = ParseCastExpression(AnyCastExpr, false/*isAddressofOperand*/, NotCastExpr, // type-id has priority. diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index 24855df334f4c..1095919baa7d3 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ b/clang/lib/Parse/ParseOpenMP.cpp @@ -2264,8 +2264,8 @@ ExprResult Parser::ParseOpenMPParensExpr(StringRef ClauseName, return ExprError(); SourceLocation ELoc = Tok.getLocation(); - ExprResult LHS(ParseCastExpression( - /*isUnaryExpression=*/false, IsAddressOfOperand, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, IsAddressOfOperand, + NotTypeCast)); ExprResult Val(ParseRHSOfBinaryExpression(LHS, prec::Conditional)); Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc, /*DiscardedValue*/ false); @@ -2513,7 +2513,7 @@ OMPClause *Parser::ParseOpenMPSingleExprWithArgClause(OpenMPClauseKind Kind, Kind == OMPC_if; if (NeedAnExpression) { SourceLocation ELoc = Tok.getLocation(); - ExprResult LHS(ParseCastExpression(false, false, NotTypeCast)); + ExprResult LHS(ParseCastExpression(AnyCastExpr, false, NotTypeCast)); Val = ParseRHSOfBinaryExpression(LHS, prec::Conditional); Val = Actions.ActOnFinishFullExpr(Val.get(), ELoc, /*DiscardedValue*/ false); diff --git a/clang/lib/Parse/ParseTemplate.cpp b/clang/lib/Parse/ParseTemplate.cpp index 928bc5aa25b35..35cee596bb016 100644 --- a/clang/lib/Parse/ParseTemplate.cpp +++ b/clang/lib/Parse/ParseTemplate.cpp @@ -130,7 +130,9 @@ Decl *Parser::ParseTemplateDeclarationOrSpecialization( if (TryConsumeToken(tok::kw_requires)) { OptionalRequiresClauseConstraintER = - Actions.CorrectDelayedTyposInExpr(ParseConstraintExpression()); + Actions.CorrectDelayedTyposInExpr( + ParseConstraintLogicalOrExpression( + /*IsTrailingRequiresClause=*/false)); if (!OptionalRequiresClauseConstraintER.isUsable()) { // Skip until the semi-colon or a '}'. 
SkipUntil(tok::r_brace, StopAtSemi | StopBeforeMatch); @@ -254,8 +256,12 @@ Decl *Parser::ParseSingleDeclarationAfterTemplate( }); LateParsedAttrList LateParsedAttrs(true); - if (DeclaratorInfo.isFunctionDeclarator()) + if (DeclaratorInfo.isFunctionDeclarator()) { + if (Tok.is(tok::kw_requires)) + ParseTrailingRequiresClause(DeclaratorInfo); + MaybeParseGNUAttributes(DeclaratorInfo, &LateParsedAttrs); + } if (DeclaratorInfo.isFunctionDeclarator() && isStartOfFunctionDefinition(DeclaratorInfo)) { diff --git a/clang/lib/Parse/ParseTentative.cpp b/clang/lib/Parse/ParseTentative.cpp index 9cc41328c469d..4d69fb4693fb8 100644 --- a/clang/lib/Parse/ParseTentative.cpp +++ b/clang/lib/Parse/ParseTentative.cpp @@ -1031,6 +1031,10 @@ Parser::TPResult Parser::TryParseDeclarator(bool mayBeAbstract, // direct-declarator '[' constant-expression[opt] ']' // direct-abstract-declarator[opt] '[' constant-expression[opt] ']' TPR = TryParseBracketDeclarator(); + } else if (Tok.is(tok::kw_requires)) { + // declarator requires-clause + // A requires clause indicates a function declaration. + TPR = TPResult::True; } else { break; } @@ -2014,7 +2018,6 @@ Parser::TryParseParameterDeclarationClause(bool *InvalidAsDeclaration, /// 'throw' '(' type-id-list[opt] ')' /// Parser::TPResult Parser::TryParseFunctionDeclarator() { - // The '(' is already parsed. TPResult TPR = TryParseParameterDeclarationClause(); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 6216206690b0c..a905ebc673056 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2008,7 +2008,7 @@ static bool fixOverloadedReinterpretCastExpr(Sema &Self, QualType DestType, // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization // preserves Result. Result = E; - if (!Self.resolveAndFixAddressOfOnlyViableOverloadCandidate( + if (!Self.resolveAndFixAddressOfSingleOverloadCandidate( Result, /*DoFunctionPointerConversion=*/true)) return false; return Result.isUsable(); diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 7f0bdc9b47822..018ac2d7dc9d1 100755 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "clang/Sema/SemaConcept.h" #include "clang/Sema/Sema.h" #include "clang/Sema/SemaInternal.h" #include "clang/Sema/SemaDiagnostic.h" @@ -18,12 +19,16 @@ #include "clang/Sema/Template.h" #include "clang/AST/ExprCXX.h" #include "clang/AST/RecursiveASTVisitor.h" +#include "clang/Basic/OperatorPrecedence.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/PointerUnion.h" using namespace clang; using namespace sema; -bool Sema::CheckConstraintExpression(Expr *ConstraintExpression) { +bool +Sema::CheckConstraintExpression(Expr *ConstraintExpression, Token NextToken, + bool *PossibleNonPrimary, + bool IsTrailingRequiresClause) { // C++2a [temp.constr.atomic]p1 // ..E shall be a constant expression of type bool. 
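Roughly, the cases the extended check distinguishes look like the following (size_ is a hypothetical constexpr variable):

  constexpr int size_ = 0;

  // error: the atomic constraint 'size_' has type int, not bool.
  // template <typename T> requires size_ struct A {};

  // error: only a primary-expression may appear as an operand of '&&'/'||'
  // in a requires-clause, so the '==' expression must be parenthesized;
  // PossibleNonPrimary lets the parser emit that hint instead of the plain
  // non-bool diagnostic.
  // template <typename T> requires size_ == 0 struct B {};

  // OK.
  template <typename T> requires (size_ == 0) struct C {};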
@@ -31,22 +36,56 @@ bool Sema::CheckConstraintExpression(Expr *ConstraintExpression) { if (auto *BinOp = dyn_cast(ConstraintExpression)) { if (BinOp->getOpcode() == BO_LAnd || BinOp->getOpcode() == BO_LOr) - return CheckConstraintExpression(BinOp->getLHS()) && - CheckConstraintExpression(BinOp->getRHS()); + return CheckConstraintExpression(BinOp->getLHS(), NextToken, + PossibleNonPrimary) && + CheckConstraintExpression(BinOp->getRHS(), NextToken, + PossibleNonPrimary); } else if (auto *C = dyn_cast(ConstraintExpression)) - return CheckConstraintExpression(C->getSubExpr()); + return CheckConstraintExpression(C->getSubExpr(), NextToken, + PossibleNonPrimary); + + QualType Type = ConstraintExpression->getType(); + + auto CheckForNonPrimary = [&] { + if (PossibleNonPrimary) + *PossibleNonPrimary = + // We have the following case: + // template requires func(0) struct S { }; + // The user probably isn't aware of the parentheses required around + // the function call, and we're only going to parse 'func' as the + // primary-expression, and complain that it is of non-bool type. + (NextToken.is(tok::l_paren) && + (IsTrailingRequiresClause || + (Type->isDependentType() && + IsDependentFunctionNameExpr(ConstraintExpression)) || + Type->isFunctionType() || + Type->isSpecificBuiltinType(BuiltinType::Overload))) || + // We have the following case: + // template requires size_ == 0 struct S { }; + // The user probably isn't aware of the parentheses required around + // the binary operator, and we're only going to parse 'func' as the + // first operand, and complain that it is of non-bool type. + getBinOpPrecedence(NextToken.getKind(), + /*GreaterThanIsOperator=*/true, + getLangOpts().CPlusPlus11) > prec::LogicalAnd; + }; // An atomic constraint! - if (ConstraintExpression->isTypeDependent()) + if (ConstraintExpression->isTypeDependent()) { + CheckForNonPrimary(); return true; + } - QualType Type = ConstraintExpression->getType(); if (!Context.hasSameUnqualifiedType(Type, Context.BoolTy)) { Diag(ConstraintExpression->getExprLoc(), diag::err_non_bool_atomic_constraint) << Type << ConstraintExpression->getSourceRange(); + CheckForNonPrimary(); return false; } + + if (PossibleNonPrimary) + *PossibleNonPrimary = false; return true; } @@ -417,123 +456,25 @@ void Sema::DiagnoseUnsatisfiedConstraint( } } -namespace { -struct AtomicConstraint { - const Expr *ConstraintExpr; - llvm::Optional> ParameterMapping; - - AtomicConstraint(Sema &S, const Expr *ConstraintExpr) : - ConstraintExpr(ConstraintExpr) { }; - - bool hasMatchingParameterMapping(ASTContext &C, - const AtomicConstraint &Other) const { - if (!ParameterMapping != !Other.ParameterMapping) - return false; - if (!ParameterMapping) - return true; - if (ParameterMapping->size() != Other.ParameterMapping->size()) - return false; - - for (unsigned I = 0, S = ParameterMapping->size(); I < S; ++I) - if (!C.getCanonicalTemplateArgument((*ParameterMapping)[I].getArgument()) - .structurallyEquals(C.getCanonicalTemplateArgument( - (*Other.ParameterMapping)[I].getArgument()))) - return false; - return true; - } - - bool subsumes(ASTContext &C, const AtomicConstraint &Other) const { - // C++ [temp.constr.order] p2 - // - an atomic constraint A subsumes another atomic constraint B - // if and only if the A and B are identical [...] - // - // C++ [temp.constr.atomic] p2 - // Two atomic constraints are identical if they are formed from the - // same expression and the targets of the parameter mappings are - // equivalent according to the rules for expressions [...] 
- - // We do not actually substitute the parameter mappings into the - // constraint expressions, therefore the constraint expressions are - // the originals, and comparing them will suffice. - if (ConstraintExpr != Other.ConstraintExpr) - return false; - - // Check that the parameter lists are identical - return hasMatchingParameterMapping(C, Other); - } -}; - -/// \brief A normalized constraint, as defined in C++ [temp.constr.normal], is -/// either an atomic constraint, a conjunction of normalized constraints or a -/// disjunction of normalized constraints. -struct NormalizedConstraint { - enum CompoundConstraintKind { CCK_Conjunction, CCK_Disjunction }; - - using CompoundConstraint = llvm::PointerIntPair< - std::pair *, 1, - CompoundConstraintKind>; - - llvm::PointerUnion Constraint; - - NormalizedConstraint(AtomicConstraint *C): Constraint{C} { }; - NormalizedConstraint(ASTContext &C, NormalizedConstraint LHS, - NormalizedConstraint RHS, CompoundConstraintKind Kind) - : Constraint{CompoundConstraint{ - new (C) std::pair{LHS, - RHS}, - Kind}} { }; - - CompoundConstraintKind getCompoundKind() const { - assert(!isAtomic() && "getCompoundKind called on atomic constraint."); - return Constraint.get().getInt(); - } - - bool isAtomic() const { return Constraint.is(); } - - NormalizedConstraint &getLHS() const { - assert(!isAtomic() && "getLHS called on atomic constraint."); - return Constraint.get().getPointer()->first; - } - - NormalizedConstraint &getRHS() const { - assert(!isAtomic() && "getRHS called on atomic constraint."); - return Constraint.get().getPointer()->second; +const NormalizedConstraint * +Sema::getNormalizedAssociatedConstraints( + NamedDecl *ConstrainedDecl, ArrayRef AssociatedConstraints) { + auto CacheEntry = NormalizationCache.find(ConstrainedDecl); + if (CacheEntry == NormalizationCache.end()) { + auto Normalized = + NormalizedConstraint::fromConstraintExprs(*this, ConstrainedDecl, + AssociatedConstraints); + CacheEntry = + NormalizationCache + .try_emplace(ConstrainedDecl, + Normalized + ? 
new (Context) NormalizedConstraint( + std::move(*Normalized)) + : nullptr) + .first; } - - AtomicConstraint *getAtomicConstraint() const { - assert(isAtomic() && - "getAtomicConstraint called on non-atomic constraint."); - return Constraint.get(); - } - - static llvm::Optional - fromConstraintExprs(Sema &S, NamedDecl *D, ArrayRef E) { - assert(E.size() != 0); - auto First = fromConstraintExpr(S, D, E[0]); - if (E.size() == 1) - return First; - auto Second = fromConstraintExpr(S, D, E[1]); - if (!Second) - return llvm::Optional{}; - llvm::Optional Conjunction; - Conjunction.emplace(S.Context, std::move(*First), std::move(*Second), - CCK_Conjunction); - for (unsigned I = 2; I < E.size(); ++I) { - auto Next = fromConstraintExpr(S, D, E[I]); - if (!Next) - return llvm::Optional{}; - NormalizedConstraint NewConjunction(S.Context, std::move(*Conjunction), - std::move(*Next), CCK_Conjunction); - *Conjunction = std::move(NewConjunction); - } - return Conjunction; - } - -private: - static llvm::Optional fromConstraintExpr(Sema &S, - NamedDecl *D, - const Expr *E); -}; + return CacheEntry->second; +} static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, ConceptDecl *Concept, ArrayRef TemplateArgs, @@ -555,11 +496,13 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, llvm::SmallBitVector OccurringIndices(TemplateParams->size()); S.MarkUsedTemplateParameters(Atomic.ConstraintExpr, /*OnlyDeduced=*/false, /*Depth=*/0, OccurringIndices); - Atomic.ParameterMapping.emplace(); - Atomic.ParameterMapping->reserve(OccurringIndices.size()); - for (unsigned I = 0, C = TemplateParams->size(); I != C; ++I) + Atomic.ParameterMapping.emplace( + MutableArrayRef( + new (S.Context) TemplateArgumentLoc[OccurringIndices.count()], + OccurringIndices.count())); + for (unsigned I = 0, J = 0, C = TemplateParams->size(); I != C; ++I) if (OccurringIndices[I]) - Atomic.ParameterMapping->push_back( + new (&(*Atomic.ParameterMapping)[J++]) TemplateArgumentLoc( S.getIdentityTemplateArgumentLoc(TemplateParams->begin()[I], // Here we assume we do not support things like // template @@ -585,6 +528,30 @@ static bool substituteParameterMappings(Sema &S, NormalizedConstraint &N, return false; } +Optional +NormalizedConstraint::fromConstraintExprs(Sema &S, NamedDecl *D, + ArrayRef E) { + assert(E.size() != 0); + auto First = fromConstraintExpr(S, D, E[0]); + if (E.size() == 1) + return First; + auto Second = fromConstraintExpr(S, D, E[1]); + if (!Second) + return None; + llvm::Optional Conjunction; + Conjunction.emplace(S.Context, std::move(*First), std::move(*Second), + CCK_Conjunction); + for (unsigned I = 2; I < E.size(); ++I) { + auto Next = fromConstraintExpr(S, D, E[I]); + if (!Next) + return llvm::Optional{}; + NormalizedConstraint NewConjunction(S.Context, std::move(*Conjunction), + std::move(*Next), CCK_Conjunction); + *Conjunction = std::move(NewConjunction); + } + return Conjunction; +} + llvm::Optional NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { assert(E != nullptr); @@ -604,11 +571,11 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { return None; return NormalizedConstraint( - S.Context, *LHS, *RHS, + S.Context, std::move(*LHS), std::move(*RHS), BO->getOpcode() == BO_LAnd ? 
CCK_Conjunction : CCK_Disjunction); } } else if (auto *CSE = dyn_cast(E)) { - Optional SubNF; + const NormalizedConstraint *SubNF; { Sema::InstantiatingTemplate Inst( S, CSE->getExprLoc(), @@ -623,24 +590,26 @@ NormalizedConstraint::fromConstraintExpr(Sema &S, NamedDecl *D, const Expr *E) { // constraint. If any such substitution results in an invalid type or // expression, the program is ill-formed; no diagnostic is required. // [...] - SubNF = fromConstraintExpr(S, CSE->getNamedConcept(), - CSE->getNamedConcept()->getConstraintExpr()); + ConceptDecl *CD = CSE->getNamedConcept(); + SubNF = S.getNormalizedAssociatedConstraints(CD, + {CD->getConstraintExpr()}); if (!SubNF) return None; } + Optional New; + New.emplace(S.Context, *SubNF); + if (substituteParameterMappings( - S, *SubNF, CSE->getNamedConcept(), + S, *New, CSE->getNamedConcept(), CSE->getTemplateArguments(), CSE->getTemplateArgsAsWritten())) return None; - return SubNF; + return New; } return NormalizedConstraint{new (S.Context) AtomicConstraint(S, E)}; } -} // namespace - using NormalForm = llvm::SmallVector, 4>; @@ -703,22 +672,9 @@ static NormalForm makeDNF(const NormalizedConstraint &Normalized) { return Res; } -static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, - NamedDecl *DQ, ArrayRef Q, bool &Subsumes) { - // C++ [temp.constr.order] p2 - // In order to determine if a constraint P subsumes a constraint Q, P is - // transformed into disjunctive normal form, and Q is transformed into - // conjunctive normal form. [...] - auto PNormalized = NormalizedConstraint::fromConstraintExprs(S, DP, P); - if (!PNormalized) - return true; - const NormalForm PDNF = makeDNF(*PNormalized); - - auto QNormalized = NormalizedConstraint::fromConstraintExprs(S, DQ, Q); - if (!QNormalized) - return true; - const NormalForm QCNF = makeCNF(*QNormalized); - +template +static bool subsumes(NormalForm PDNF, NormalForm QCNF, + AtomicSubsumptionEvaluator E) { // C++ [temp.constr.order] p2 // Then, P subsumes Q if and only if, for every disjunctive clause Pi in the // disjunctive normal form of P, Pi subsumes every conjunctive clause Qj in @@ -733,7 +689,7 @@ static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, bool Found = false; for (const AtomicConstraint *Pia : Pi) { for (const AtomicConstraint *Qjb : Qj) { - if (Pia->subsumes(S.Context, *Qjb)) { + if (E(*Pia, *Qjb)) { Found = true; break; } @@ -741,13 +697,32 @@ static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, if (Found) break; } - if (!Found) { - Subsumes = false; + if (!Found) return false; - } } } - Subsumes = true; + return true; +} + +template +static bool subsumes(Sema &S, NamedDecl *DP, ArrayRef P, + NamedDecl *DQ, ArrayRef Q, bool &Subsumes, + AtomicSubsumptionEvaluator E) { + // C++ [temp.constr.order] p2 + // In order to determine if a constraint P subsumes a constraint Q, P is + // transformed into disjunctive normal form, and Q is transformed into + // conjunctive normal form. [...] 
+ auto *PNormalized = S.getNormalizedAssociatedConstraints(DP, P); + if (!PNormalized) + return true; + const NormalForm PDNF = makeDNF(*PNormalized); + + auto *QNormalized = S.getNormalizedAssociatedConstraints(DQ, Q); + if (!QNormalized) + return true; + const NormalForm QCNF = makeCNF(*QNormalized); + + Subsumes = subsumes(PDNF, QCNF, E); return false; } @@ -770,8 +745,84 @@ bool Sema::IsAtLeastAsConstrained(NamedDecl *D1, ArrayRef AC1, Result = CacheEntry->second; return false; } - if (subsumes(*this, D1, AC1, D2, AC2, Result)) + + if (subsumes(*this, D1, AC1, D2, AC2, Result, + [this] (const AtomicConstraint &A, const AtomicConstraint &B) { + return A.subsumes(Context, B); + })) return true; SubsumptionCache.try_emplace(Key, Result); return false; -} \ No newline at end of file +} + +bool Sema::MaybeEmitAmbiguousAtomicConstraintsDiagnostic(NamedDecl *D1, + ArrayRef AC1, NamedDecl *D2, ArrayRef AC2) { + if (isSFINAEContext()) + // No need to work here because our notes would be discarded. + return false; + + if (AC1.empty() || AC2.empty()) + return false; + + auto NormalExprEvaluator = + [this] (const AtomicConstraint &A, const AtomicConstraint &B) { + return A.subsumes(Context, B); + }; + + const Expr *AmbiguousAtomic1 = nullptr, *AmbiguousAtomic2 = nullptr; + auto IdenticalExprEvaluator = + [&] (const AtomicConstraint &A, const AtomicConstraint &B) { + if (!A.hasMatchingParameterMapping(Context, B)) + return false; + const Expr *EA = A.ConstraintExpr, *EB = B.ConstraintExpr; + if (EA == EB) + return true; + + // Not the same source level expression - are the expressions + // identical? + llvm::FoldingSetNodeID IDA, IDB; + EA->Profile(IDA, Context, /*Cannonical=*/true); + EB->Profile(IDB, Context, /*Cannonical=*/true); + if (IDA != IDB) + return false; + + AmbiguousAtomic1 = EA; + AmbiguousAtomic2 = EB; + return true; + }; + + { + // The subsumption checks might cause diagnostics + SFINAETrap Trap(*this); + auto *Normalized1 = getNormalizedAssociatedConstraints(D1, AC1); + if (!Normalized1) + return false; + const NormalForm DNF1 = makeDNF(*Normalized1); + const NormalForm CNF1 = makeCNF(*Normalized1); + + auto *Normalized2 = getNormalizedAssociatedConstraints(D2, AC2); + if (!Normalized2) + return false; + const NormalForm DNF2 = makeDNF(*Normalized2); + const NormalForm CNF2 = makeCNF(*Normalized2); + + bool Is1AtLeastAs2Normally = subsumes(DNF1, CNF2, NormalExprEvaluator); + bool Is2AtLeastAs1Normally = subsumes(DNF2, CNF1, NormalExprEvaluator); + bool Is1AtLeastAs2 = subsumes(DNF1, CNF2, IdenticalExprEvaluator); + bool Is2AtLeastAs1 = subsumes(DNF2, CNF1, IdenticalExprEvaluator); + if (Is1AtLeastAs2 == Is1AtLeastAs2Normally && + Is2AtLeastAs1 == Is2AtLeastAs1Normally) + // Same result - no ambiguity was caused by identical atomic expressions. + return false; + } + + // A different result! 
Some ambiguous atomic constraint(s) caused a difference + assert(AmbiguousAtomic1 && AmbiguousAtomic2); + + Diag(AmbiguousAtomic1->getBeginLoc(), diag::note_ambiguous_atomic_constraints) + << AmbiguousAtomic1->getSourceRange(); + Diag(AmbiguousAtomic2->getBeginLoc(), + diag::note_ambiguous_atomic_constraints_similar_expression) + << AmbiguousAtomic2->getSourceRange(); + return true; +} diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 92b115c8a3f3c..468db9cfc7f84 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7895,7 +7895,13 @@ struct FindOverriddenMethod { Path.Decls = Path.Decls.slice(1)) { NamedDecl *D = Path.Decls.front(); if (CXXMethodDecl *MD = dyn_cast(D)) { - if (MD->isVirtual() && !S->IsOverload(Method, MD, false)) + if (MD->isVirtual() && + !S->IsOverload( + Method, MD, /*UseMemberUsingDeclRules=*/false, + /*ConsiderCudaAttrs=*/true, + // C++2a [class.virtual]p2 does not consider requires clauses + // when overriding. + /*ConsiderRequiresClauses=*/false)) return true; } } @@ -8240,7 +8246,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, NewFD = FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), NameInfo, R, TInfo, SC, isInline, HasPrototype, - CSK_unspecified); + CSK_unspecified, + /*TrailingRequiresClause=*/nullptr); if (D.isInvalidType()) NewFD->setInvalidDecl(); @@ -8257,6 +8264,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, ConstexprKind = CSK_unspecified; D.getMutableDeclSpec().ClearConstexprSpec(); } + Expr *TrailingRequiresClause = D.getTrailingRequiresClause(); // Check that the return type is not an abstract class type. // For record types, this is done by the AbstractClassUsageDiagnoser once @@ -8276,7 +8284,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, return CXXConstructorDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, TInfo, ExplicitSpecifier, isInline, - /*isImplicitlyDeclared=*/false, ConstexprKind); + /*isImplicitlyDeclared=*/false, ConstexprKind, InheritedConstructor(), + TrailingRequiresClause); } else if (Name.getNameKind() == DeclarationName::CXXDestructorName) { // This is a C++ destructor declaration. @@ -8285,8 +8294,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, CXXRecordDecl *Record = cast(DC); CXXDestructorDecl *NewDD = CXXDestructorDecl::Create( SemaRef.Context, Record, D.getBeginLoc(), NameInfo, R, TInfo, - isInline, - /*isImplicitlyDeclared=*/false, ConstexprKind); + isInline, /*isImplicitlyDeclared=*/false, ConstexprKind, + TrailingRequiresClause); // If the destructor needs an implicit exception specification, set it // now. 
FIXME: It'd be nice to be able to create the right type to start @@ -8306,7 +8315,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, return FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), D.getIdentifierLoc(), Name, R, TInfo, SC, isInline, - /*hasPrototype=*/true, ConstexprKind); + /*hasPrototype=*/true, ConstexprKind, + TrailingRequiresClause); } } else if (Name.getNameKind() == DeclarationName::CXXConversionFunctionName) { @@ -8323,9 +8333,14 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, IsVirtualOkay = true; return CXXConversionDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, - TInfo, isInline, ExplicitSpecifier, ConstexprKind, SourceLocation()); + TInfo, isInline, ExplicitSpecifier, ConstexprKind, SourceLocation(), + TrailingRequiresClause); } else if (Name.getNameKind() == DeclarationName::CXXDeductionGuideName) { + if (TrailingRequiresClause) + SemaRef.Diag(TrailingRequiresClause->getBeginLoc(), + diag::err_trailing_requires_clause_on_deduction_guide) + << TrailingRequiresClause->getSourceRange(); SemaRef.CheckDeductionGuideDeclarator(D, R, SC); return CXXDeductionGuideDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), @@ -8347,7 +8362,8 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, // This is a C++ method declaration. CXXMethodDecl *Ret = CXXMethodDecl::Create( SemaRef.Context, cast(DC), D.getBeginLoc(), NameInfo, R, - TInfo, SC, isInline, ConstexprKind, SourceLocation()); + TInfo, SC, isInline, ConstexprKind, SourceLocation(), + TrailingRequiresClause); IsVirtualOkay = !Ret->isStatic(); return Ret; } else { @@ -8361,7 +8377,7 @@ static FunctionDecl *CreateNewFunctionDecl(Sema &SemaRef, Declarator &D, // - we're in C++ (where every function has a prototype), return FunctionDecl::Create(SemaRef.Context, DC, D.getBeginLoc(), NameInfo, R, TInfo, SC, isInline, true /*HasPrototype*/, - ConstexprKind); + ConstexprKind, TrailingRequiresClause); } } @@ -10572,6 +10588,11 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, } } } + if (Method->isVirtual() && NewFD->getTrailingRequiresClause()) + // C++2a [class.virtual]p6 + // A virtual method shall not have a requires-clause. 
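For example (Base is a hypothetical class template):

  template <typename T>
  struct Base {
    // Rejected by the check below with err_constrained_virtual_method.
    // virtual void f() requires (sizeof(T) > 1);
  };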
+ Diag(NewFD->getTrailingRequiresClause()->getBeginLoc(), + diag::err_constrained_virtual_method); if (Method->isStatic()) checkThisInStaticMemberFunctionType(Method); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 02aebbea4a8bd..7f1da406757dd 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7569,7 +7569,8 @@ NamedDecl * Sema::DeclClonePragmaWeak(NamedDecl *ND, IdentifierInfo *II, NewFD = FunctionDecl::Create( FD->getASTContext(), FD->getDeclContext(), Loc, Loc, DeclarationName(II), FD->getType(), FD->getTypeSourceInfo(), SC_None, - false /*isInlineSpecified*/, FD->hasPrototype(), CSK_unspecified); + false /*isInlineSpecified*/, FD->hasPrototype(), CSK_unspecified, + FD->getTrailingRequiresClause()); NewD = NewFD; if (FD->getQualifier()) diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 5dba1c1d2c121..497e45631be4a 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -3868,6 +3868,26 @@ void Sema::ActOnStartCXXInClassMemberInitializer() { PushFunctionScope(); } +void Sema::ActOnStartTrailingRequiresClause(Scope *S, Declarator &D) { + if (!D.isFunctionDeclarator()) + return; + auto &FTI = D.getFunctionTypeInfo(); + if (!FTI.Params) + return; + for (auto &Param : ArrayRef(FTI.Params, + FTI.NumParams)) { + auto *ParamDecl = cast(Param.Param); + if (ParamDecl->getDeclName()) + PushOnScopeChains(ParamDecl, S, /*AddToContext=*/false); + } +} + +ExprResult Sema::ActOnFinishTrailingRequiresClause(ExprResult ConstraintExpr) { + if (ConstraintExpr.isInvalid()) + return ExprError(); + return CorrectDelayedTyposInExpr(ConstraintExpr); +} + /// This is invoked after parsing an in-class initializer for a /// non-static C++ class member, and after instantiating an in-class initializer /// in a class template. Such actions are deferred until the class is complete. @@ -12702,7 +12722,8 @@ Sema::findInheritingConstructor(SourceLocation Loc, BaseCtor->getExplicitSpecifier(), /*isInline=*/true, /*isImplicitlyDeclared=*/true, Constexpr ? BaseCtor->getConstexprKind() : CSK_unspecified, - InheritedConstructor(Shadow, BaseCtor)); + InheritedConstructor(Shadow, BaseCtor), + BaseCtor->getTrailingRequiresClause()); if (Shadow->isInvalidDecl()) DerivedCtor->setInvalidDecl(); @@ -17092,6 +17113,11 @@ bool Sema::checkThisInStaticMemberFunctionType(CXXMethodDecl *Method) { if (checkThisInStaticMemberFunctionExceptionSpec(Method)) return true; + // Check the trailing requires clause + if (Expr *E = Method->getTrailingRequiresClause()) + if (!Finder.TraverseStmt(E)) + return true; + return checkThisInStaticMemberFunctionAttributes(Method); } diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 61cd629646f60..6c1d3deab2a3f 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -326,6 +326,30 @@ bool Sema::DiagnoseUseOfDecl(NamedDecl *D, ArrayRef Locs, diagnoseUseOfInternalDeclInInlineFunction(*this, D, Loc); + // [expr.prim.id]p4 + // A program that refers explicitly or implicitly to a function with a + // trailing requires-clause whose constraint-expression is not satisfied, + // other than to declare it, is ill-formed. [...] + // + // See if this is a function with constraints that need to be satisfied. 
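A small sketch of the rule being enforced (IsInt and only_int are hypothetical names):

  template <typename T> constexpr bool IsInt = false;
  template <>           constexpr bool IsInt<int> = true;

  template <typename T>
  void only_int() requires IsInt<T> {}

  void use() {
    only_int<int>();     // OK: the trailing requires-clause is satisfied.
    // only_int<char>(); // ill-formed: refers to a function whose
    //                   // requires-clause is not satisfied; the diagnostic
    //                   // added here explains which constraint failed.
  }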
+ if (FunctionDecl *FD = dyn_cast(D)) { + if (Expr *RC = FD->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + bool Failed = CheckConstraintSatisfaction(RC, Satisfaction); + if (Failed) + // A diagnostic will have already been generated (non-constant + // constraint expression, for example) + return true; + if (!Satisfaction.IsSatisfied) { + Diag(Loc, + diag::err_reference_to_function_with_unsatisfied_constraints) + << D; + DiagnoseUnsatisfiedConstraint(Satisfaction); + return true; + } + } + } + return false; } @@ -17975,7 +17999,7 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) { // No guarantees that ResolveAndFixSingleFunctionTemplateSpecialization // leaves Result unchanged on failure. Result = E; - if (resolveAndFixAddressOfOnlyViableOverloadCandidate(Result)) + if (resolveAndFixAddressOfSingleOverloadCandidate(Result)) return Result; // If that failed, try to recover with a call. @@ -18112,3 +18136,8 @@ ExprResult Sema::ActOnObjCAvailabilityCheckExpr( return new (Context) ObjCAvailabilityCheckExpr(Version, AtLoc, RParen, Context.BoolTy); } + +bool Sema::IsDependentFunctionNameExpr(Expr *E) { + assert(E->isTypeDependent()); + return isa(E); +} diff --git a/clang/lib/Sema/SemaLambda.cpp b/clang/lib/Sema/SemaLambda.cpp index d09a3377d2b03..c2d14a44f53d4 100644 --- a/clang/lib/Sema/SemaLambda.cpp +++ b/clang/lib/Sema/SemaLambda.cpp @@ -361,7 +361,8 @@ CXXMethodDecl *Sema::startLambdaDefinition(CXXRecordDecl *Class, TypeSourceInfo *MethodTypeInfo, SourceLocation EndLoc, ArrayRef Params, - ConstexprSpecKind ConstexprKind) { + ConstexprSpecKind ConstexprKind, + Expr *TrailingRequiresClause) { QualType MethodType = MethodTypeInfo->getType(); TemplateParameterList *TemplateParams = getGenericLambdaTemplateParameterList(getCurLambda(), *this); @@ -395,7 +396,7 @@ CXXMethodDecl *Sema::startLambdaDefinition(CXXRecordDecl *Class, DeclarationNameInfo(MethodName, IntroducerRange.getBegin(), MethodNameLoc), MethodType, MethodTypeInfo, SC_None, - /*isInline=*/true, ConstexprKind, EndLoc); + /*isInline=*/true, ConstexprKind, EndLoc, TrailingRequiresClause); Method->setAccess(AS_public); if (!TemplateParams) Class->addDecl(Method); @@ -972,7 +973,8 @@ void Sema::ActOnStartOfLambdaDefinition(LambdaIntroducer &Intro, KnownDependent, Intro.Default); CXXMethodDecl *Method = startLambdaDefinition(Class, Intro.Range, MethodTyInfo, EndLoc, Params, - ParamInfo.getDeclSpec().getConstexprSpecifier()); + ParamInfo.getDeclSpec().getConstexprSpecifier(), + ParamInfo.getTrailingRequiresClause()); if (ExplicitParams) CheckCXXDefaultArguments(Method); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 83b7f497f99d9..fa811ee2bd25a 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1137,7 +1137,8 @@ Sema::CheckOverload(Scope *S, FunctionDecl *New, const LookupResult &Old, } bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old, - bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs) { + bool UseMemberUsingDeclRules, bool ConsiderCudaAttrs, + bool ConsiderRequiresClauses) { // C++ [basic.start.main]p2: This function shall not be overloaded. if (New->isMain()) return false; @@ -1273,23 +1274,38 @@ bool Sema::IsOverload(FunctionDecl *New, FunctionDecl *Old, if (getLangOpts().CUDA && ConsiderCudaAttrs) { // Don't allow overloading of destructors. (In theory we could, but it // would be a giant change to clang.) 
- if (isa(New)) - return false; - - CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New), - OldTarget = IdentifyCUDATarget(Old); - if (NewTarget == CFT_InvalidTarget) - return false; + if (!isa(New)) { + CUDAFunctionTarget NewTarget = IdentifyCUDATarget(New), + OldTarget = IdentifyCUDATarget(Old); + if (NewTarget != CFT_InvalidTarget) { + assert((OldTarget != CFT_InvalidTarget) && + "Unexpected invalid target."); + + // Allow overloading of functions with same signature and different CUDA + // target attributes. + if (NewTarget != OldTarget) + return true; + } + } + } - assert((OldTarget != CFT_InvalidTarget) && "Unexpected invalid target."); + if (ConsiderRequiresClauses) { + Expr *NewRC = New->getTrailingRequiresClause(), + *OldRC = Old->getTrailingRequiresClause(); + if ((NewRC != nullptr) != (OldRC != nullptr)) + // RC are most certainly different - these are overloads. + return true; - // Allow overloading of functions with same signature and different CUDA - // target attributes. - return NewTarget != OldTarget; + if (NewRC) { + llvm::FoldingSetNodeID NewID, OldID; + NewRC->Profile(NewID, Context, /*Canonical=*/true); + OldRC->Profile(OldID, Context, /*Canonical=*/true); + if (NewID != OldID) + // RCs are not equivalent - these are overloads. + return true; + } } - // TODO: Concepts: Check function trailing requires clauses here. - // The signatures match; this is not an overload. return false; } @@ -6258,6 +6274,16 @@ void Sema::AddOverloadCandidate( return; } + if (Expr *RequiresClause = Function->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // Determine the implicit conversion sequences for each of the // arguments. for (unsigned ArgIdx = 0; ArgIdx < Args.size(); ++ArgIdx) { @@ -6774,6 +6800,16 @@ Sema::AddMethodCandidate(CXXMethodDecl *Method, DeclAccessPair FoundDecl, return; } + if (Expr *RequiresClause = Method->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // Determine the implicit conversion sequences for each of the // arguments. for (unsigned ArgIdx = 0; ArgIdx < Args.size(); ++ArgIdx) { @@ -7130,6 +7166,17 @@ void Sema::AddConversionCandidate( return; } + Expr *RequiresClause = Conversion->getTrailingRequiresClause(); + if (RequiresClause) { + ConstraintSatisfaction Satisfaction; + if (CheckConstraintSatisfaction(RequiresClause, Satisfaction) || + !Satisfaction.IsSatisfied) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_constraints_not_satisfied; + return; + } + } + // We won't go through a user-defined type conversion function to convert a // derived to base as such conversions are given Conversion Rank. They only // go through a copy constructor. 
13.3.3.1.2-p4 [over.ics.user] @@ -9461,6 +9508,35 @@ bool clang::isBetterOverloadCandidate( return BetterTemplate == Cand1.Function->getPrimaryTemplate(); } + // -— F1 and F2 are non-template functions with the same + // parameter-type-lists, and F1 is more constrained than F2 [...], + if (Cand1.Function && Cand2.Function && !Cand1IsSpecialization && + !Cand2IsSpecialization && Cand1.Function->hasPrototype() && + Cand2.Function->hasPrototype()) { + auto *PT1 = cast(Cand1.Function->getFunctionType()); + auto *PT2 = cast(Cand2.Function->getFunctionType()); + if (PT1->getNumParams() == PT2->getNumParams() && + PT1->isVariadic() == PT2->isVariadic() && + S.FunctionParamTypesAreEqual(PT1, PT2)) { + Expr *RC1 = Cand1.Function->getTrailingRequiresClause(); + Expr *RC2 = Cand2.Function->getTrailingRequiresClause(); + if (RC1 && RC2) { + bool AtLeastAsConstrained1, AtLeastAsConstrained2; + if (S.IsAtLeastAsConstrained(Cand1.Function, {RC1}, Cand2.Function, + {RC2}, AtLeastAsConstrained1)) + return false; + if (!AtLeastAsConstrained1) + return false; + if (S.IsAtLeastAsConstrained(Cand2.Function, {RC2}, Cand1.Function, + {RC1}, AtLeastAsConstrained2)) + return false; + if (!AtLeastAsConstrained2) + return true; + } else if (RC1 || RC2) + return RC1 != nullptr; + } + } + // -- F1 is a constructor for a class D, F2 is a constructor for a base // class B of D, and for all arguments the corresponding parameters of // F1 and F2 have the same type. @@ -9829,6 +9905,24 @@ static bool checkAddressOfFunctionIsAvailable(Sema &S, const FunctionDecl *FD, return false; } + if (const Expr *RC = FD->getTrailingRequiresClause()) { + ConstraintSatisfaction Satisfaction; + if (S.CheckConstraintSatisfaction(RC, Satisfaction)) + return false; + if (!Satisfaction.IsSatisfied) { + if (Complain) { + if (InOverloadResolution) + S.Diag(FD->getBeginLoc(), + diag::note_ovl_candidate_unsatisfied_constraints); + else + S.Diag(Loc, diag::err_addrof_function_constraints_not_satisfied) + << FD; + S.DiagnoseUnsatisfiedConstraint(Satisfaction); + } + return false; + } + } + auto I = llvm::find_if(FD->parameters(), [](const ParmVarDecl *P) { return P->hasAttr(); }); @@ -9886,6 +9980,55 @@ void Sema::NoteOverloadCandidate(NamedDecl *Found, FunctionDecl *Fn, MaybeEmitInheritedConstructorNote(*this, Found); } +static void +MaybeDiagnoseAmbiguousConstraints(Sema &S, ArrayRef Cands) { + // Perhaps the ambiguity was caused by two atomic constraints that are + // 'identical' but not equivalent: + // + // void foo() requires (sizeof(T) > 4) { } // #1 + // void foo() requires (sizeof(T) > 4) && T::value { } // #2 + // + // The 'sizeof(T) > 4' constraints are seemingly equivalent and should cause + // #2 to subsume #1, but these constraint are not considered equivalent + // according to the subsumption rules because they are not the same + // source-level construct. This behavior is quite confusing and we should try + // to help the user figure out what happened. 
+ + SmallVector FirstAC, SecondAC; + FunctionDecl *FirstCand = nullptr, *SecondCand = nullptr; + for (auto I = Cands.begin(), E = Cands.end(); I != E; ++I) { + if (!I->Function) + continue; + SmallVector AC; + if (auto *Template = I->Function->getPrimaryTemplate()) + Template->getAssociatedConstraints(AC); + else + I->Function->getAssociatedConstraints(AC); + if (AC.empty()) + continue; + if (FirstCand == nullptr) { + FirstCand = I->Function; + FirstAC = AC; + } else if (SecondCand == nullptr) { + SecondCand = I->Function; + SecondAC = AC; + } else { + // We have more than one pair of constrained functions - this check is + // expensive and we'd rather not try to diagnose it. + return; + } + } + if (!SecondCand) + return; + // The diagnostic can only happen if there are associated constraints on + // both sides (there needs to be some identical atomic constraint). + if (S.MaybeEmitAmbiguousAtomicConstraintsDiagnostic(FirstCand, FirstAC, + SecondCand, SecondAC)) + // Just show the user one diagnostic, they'll probably figure it out + // from here. + return; +} + // Notes the location of all overload candidates designated through // OverloadedExpr void Sema::NoteAllOverloadCandidates(Expr *OverloadedExpr, QualType DestType, @@ -10771,6 +10914,23 @@ static void NoteFunctionCandidate(Sema &S, OverloadCandidate *Cand, case ovl_non_default_multiversion_function: // Do nothing, these should simply be ignored. break; + + case ovl_fail_constraints_not_satisfied: { + std::string FnDesc; + std::pair FnKindPair = + ClassifyOverloadCandidate(S, Cand->FoundDecl, Fn, + Cand->getRewriteKind(), FnDesc); + + S.Diag(Fn->getLocation(), + diag::note_ovl_candidate_constraints_not_satisfied) + << (unsigned)FnKindPair.first << (unsigned)ocs_non_template + << FnDesc /* Ignored */; + ConstraintSatisfaction Satisfaction; + if (S.CheckConstraintSatisfaction(Fn->getTrailingRequiresClause(), + Satisfaction)) + break; + S.DiagnoseUnsatisfiedConstraint(Satisfaction); + } } } @@ -11156,6 +11316,9 @@ void OverloadCandidateSet::NoteCandidates(PartialDiagnosticAt PD, S.Diag(PD.first, PD.second); NoteCandidates(S, Args, Cands, Opc, OpLoc); + + if (OCD == OCD_AmbiguousCandidates) + MaybeDiagnoseAmbiguousConstraints(S, {begin(), end()}); } void OverloadCandidateSet::NoteCandidates(Sema &S, ArrayRef Args, @@ -11804,15 +11967,33 @@ Sema::ResolveAddressOfOverloadedFunction(Expr *AddressOfExpr, /// resolve that function to a single function that can have its address taken. /// This will modify `Pair` iff it returns non-null. /// -/// This routine can only realistically succeed if all but one candidates in the -/// overload set for SrcExpr cannot have their addresses taken. +/// This routine can only succeed if from all of the candidates in the overload +/// set for SrcExpr that can have their addresses taken, there is one candidate +/// that is more constrained than the rest. 
FunctionDecl * -Sema::resolveAddressOfOnlyViableOverloadCandidate(Expr *E, - DeclAccessPair &Pair) { +Sema::resolveAddressOfSingleOverloadCandidate(Expr *E, DeclAccessPair &Pair) { OverloadExpr::FindResult R = OverloadExpr::find(E); OverloadExpr *Ovl = R.Expression; + bool IsResultAmbiguous = false; FunctionDecl *Result = nullptr; DeclAccessPair DAP; + SmallVector AmbiguousDecls; + + auto CheckMoreConstrained = + [&] (FunctionDecl *FD1, FunctionDecl *FD2) -> Optional { + SmallVector AC1, AC2; + FD1->getAssociatedConstraints(AC1); + FD2->getAssociatedConstraints(AC2); + bool AtLeastAsConstrained1, AtLeastAsConstrained2; + if (IsAtLeastAsConstrained(FD1, AC1, FD2, AC2, AtLeastAsConstrained1)) + return None; + if (IsAtLeastAsConstrained(FD2, AC2, FD1, AC1, AtLeastAsConstrained2)) + return None; + if (AtLeastAsConstrained1 == AtLeastAsConstrained2) + return None; + return AtLeastAsConstrained1; + }; + // Don't use the AddressOfResolver because we're specifically looking for // cases where we have one overload candidate that lacks // enable_if/pass_object_size/... @@ -11824,32 +12005,54 @@ Sema::resolveAddressOfOnlyViableOverloadCandidate(Expr *E, if (!checkAddressOfFunctionIsAvailable(FD)) continue; - // We have more than one result; quit. - if (Result) - return nullptr; + // We have more than one result - see if it is more constrained than the + // previous one. + if (Result) { + Optional MoreConstrainedThanPrevious = CheckMoreConstrained(FD, + Result); + if (!MoreConstrainedThanPrevious) { + IsResultAmbiguous = true; + AmbiguousDecls.push_back(FD); + continue; + } + if (!*MoreConstrainedThanPrevious) + continue; + // FD is more constrained - replace Result with it. + } + IsResultAmbiguous = false; DAP = I.getPair(); Result = FD; } - if (Result) + if (IsResultAmbiguous) + return nullptr; + + if (Result) { + SmallVector ResultAC; + // We skipped over some ambiguous declarations which might be ambiguous with + // the selected result. + for (FunctionDecl *Skipped : AmbiguousDecls) + if (!CheckMoreConstrained(Skipped, Result).hasValue()) + return nullptr; Pair = DAP; + } return Result; } /// Given an overloaded function, tries to turn it into a non-overloaded -/// function reference using resolveAddressOfOnlyViableOverloadCandidate. This +/// function reference using resolveAddressOfSingleOverloadCandidate. This /// will perform access checks, diagnose the use of the resultant decl, and, if /// requested, potentially perform a function-to-pointer decay. /// -/// Returns false if resolveAddressOfOnlyViableOverloadCandidate fails. +/// Returns false if resolveAddressOfSingleOverloadCandidate fails. /// Otherwise, returns true. This may emit diagnostics and return true. 
-bool Sema::resolveAndFixAddressOfOnlyViableOverloadCandidate( +bool Sema::resolveAndFixAddressOfSingleOverloadCandidate( ExprResult &SrcExpr, bool DoFunctionPointerConverion) { Expr *E = SrcExpr.get(); assert(E->getType() == Context.OverloadTy && "SrcExpr must be an overload"); DeclAccessPair DAP; - FunctionDecl *Found = resolveAddressOfOnlyViableOverloadCandidate(E, DAP); + FunctionDecl *Found = resolveAddressOfSingleOverloadCandidate(E, DAP); if (!Found || Found->isCPUDispatchMultiVersion() || Found->isCPUSpecificMultiVersion()) return false; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index ade8a5a6ac148..69aabcd7d6345 100755 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -3750,6 +3750,11 @@ static void checkMoreSpecializedThanPrimary(Sema &S, PartialSpecDecl *Partial) { } S.Diag(Template->getLocation(), diag::note_template_decl_here); + SmallVector PartialAC, TemplateAC; + Template->getAssociatedConstraints(TemplateAC); + Partial->getAssociatedConstraints(PartialAC); + S.MaybeEmitAmbiguousAtomicConstraintsDiagnostic(Partial, PartialAC, Template, + TemplateAC); } static void diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 521160d1ad23e..d267ae8572e44 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -3389,11 +3389,6 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( PartialOverloading)) return Result; - if (TemplateDeductionResult Result - = CheckDeducedArgumentConstraints(*this, FunctionTemplate, Builder, - Info)) - return Result; - // C++ [temp.deduct.call]p10: [DR1391] // If deduction succeeds for all parameters that contain // template-parameters that participate in template argument deduction, @@ -3439,6 +3434,23 @@ Sema::TemplateDeductionResult Sema::FinishTemplateArgumentDeduction( return TDK_SubstitutionFailure; } + // C++2a [temp.deduct]p5 + // [...] When all template arguments have been deduced [...] all uses of + // template parameters [...] are replaced with the corresponding deduced + // or default argument values. + // [...] If the function template has associated constraints + // ([temp.constr.decl]), those constraints are checked for satisfaction + // ([temp.constr.constr]). If the constraints are not satisfied, type + // deduction fails. 
+ if (CheckInstantiatedFunctionTemplateConstraints(Info.getLocation(), + Specialization, Builder, Info.AssociatedConstraintsSatisfaction)) + return TDK_MiscellaneousDeductionFailure; + + if (!Info.AssociatedConstraintsSatisfaction.IsSatisfied) { + Info.reset(TemplateArgumentList::CreateCopy(Context, Builder)); + return TDK_ConstraintsNotSatisfied; + } + if (OriginalCallArgs) { // C++ [temp.deduct.call]p4: // In general, the deduction process attempts to find template argument @@ -3559,7 +3571,7 @@ ResolveOverloadForDeduction(Sema &S, TemplateParameterList *TemplateParams, DeclAccessPair DAP; if (FunctionDecl *Viable = - S.resolveAddressOfOnlyViableOverloadCandidate(Arg, DAP)) + S.resolveAddressOfSingleOverloadCandidate(Arg, DAP)) return GetTypeOfFunction(S, R, Viable); return {}; diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 770c07d41b163..af41e231134d0 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -150,7 +150,7 @@ Sema::getTemplateInstantiationArgs(NamedDecl *D, break; // If this function is a generic lambda specialization, we are done. - if (isGenericLambdaCallOperatorSpecialization(Function)) + if (isGenericLambdaCallOperatorOrStaticInvokerSpecialization(Function)) break; } else if (FunctionTemplateDecl *FunTmpl diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 98e05f099198a..ce94c036eb14d 100755 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -1845,6 +1845,18 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( return nullptr; } + // FIXME: Concepts: Do not substitute into constraint expressions + Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + if (TrailingRequiresClause) { + ExprResult SubstRC = SemaRef.SubstExpr(TrailingRequiresClause, + TemplateArgs); + if (SubstRC.isInvalid()) + return nullptr; + TrailingRequiresClause = SubstRC.get(); + if (!SemaRef.CheckConstraintExpression(TrailingRequiresClause)) + return nullptr; + } + // If we're instantiating a local function declaration, put the result // in the enclosing namespace; otherwise we need to find the instantiated // context. @@ -1881,7 +1893,8 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( Function = FunctionDecl::Create( SemaRef.Context, DC, D->getInnerLocStart(), NameInfo, T, TInfo, D->getCanonicalDecl()->getStorageClass(), D->isInlineSpecified(), - D->hasWrittenPrototype(), D->getConstexprKind()); + D->hasWrittenPrototype(), D->getConstexprKind(), + TrailingRequiresClause); Function->setRangeEnd(D->getSourceRange().getEnd()); } @@ -1908,6 +1921,9 @@ Decl *TemplateDeclInstantiator::VisitFunctionDecl( Params[P]->setOwningFunction(Function); Function->setParams(Params); + if (TrailingRequiresClause) + Function->setTrailingRequiresClause(TrailingRequiresClause); + if (TemplateParams) { // Our resulting instantiation is actually a function template, since we // are substituting only the outer template parameters. 
For example, given @@ -2167,6 +2183,18 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( return nullptr; } + // FIXME: Concepts: Do not substitute into constraint expressions + Expr *TrailingRequiresClause = D->getTrailingRequiresClause(); + if (TrailingRequiresClause) { + ExprResult SubstRC = SemaRef.SubstExpr(TrailingRequiresClause, + TemplateArgs); + if (SubstRC.isInvalid()) + return nullptr; + TrailingRequiresClause = SubstRC.get(); + if (!SemaRef.CheckConstraintExpression(TrailingRequiresClause)) + return nullptr; + } + DeclContext *DC = Owner; if (isFriend) { if (QualifierLoc) { @@ -2199,23 +2227,27 @@ Decl *TemplateDeclInstantiator::VisitCXXMethodDecl( Method = CXXConstructorDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, InstantiatedExplicitSpecifier, Constructor->isInlineSpecified(), false, - Constructor->getConstexprKind()); + Constructor->getConstexprKind(), InheritedConstructor(), + TrailingRequiresClause); Method->setRangeEnd(Constructor->getEndLoc()); } else if (CXXDestructorDecl *Destructor = dyn_cast(D)) { Method = CXXDestructorDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, - Destructor->isInlineSpecified(), false, Destructor->getConstexprKind()); + Destructor->isInlineSpecified(), false, Destructor->getConstexprKind(), + TrailingRequiresClause); Method->setRangeEnd(Destructor->getEndLoc()); } else if (CXXConversionDecl *Conversion = dyn_cast(D)) { Method = CXXConversionDecl::Create( SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, Conversion->isInlineSpecified(), InstantiatedExplicitSpecifier, - Conversion->getConstexprKind(), Conversion->getEndLoc()); + Conversion->getConstexprKind(), Conversion->getEndLoc(), + TrailingRequiresClause); } else { StorageClass SC = D->isStatic() ? SC_Static : SC_None; Method = CXXMethodDecl::Create(SemaRef.Context, Record, StartLoc, NameInfo, T, TInfo, SC, D->isInlineSpecified(), - D->getConstexprKind(), D->getEndLoc()); + D->getConstexprKind(), D->getEndLoc(), + TrailingRequiresClause); } if (D->isInlined()) @@ -4117,6 +4149,48 @@ void Sema::InstantiateExceptionSpec(SourceLocation PointOfInstantiation, TemplateArgs); } +bool Sema::CheckInstantiatedFunctionTemplateConstraints( + SourceLocation PointOfInstantiation, FunctionDecl *Decl, + ArrayRef TemplateArgs, + ConstraintSatisfaction &Satisfaction) { + // In most cases we're not going to have constraints, so check for that first. + FunctionTemplateDecl *Template = Decl->getPrimaryTemplate(); + // Note - code synthesis context for the constraints check is created + // inside CheckConstraintsSatisfaction. + SmallVector TemplateAC; + Template->getAssociatedConstraints(TemplateAC); + if (TemplateAC.empty()) { + Satisfaction.IsSatisfied = true; + return false; + } + + // Enter the scope of this instantiation. We don't use + // PushDeclContext because we don't have a scope. + Sema::ContextRAII savedContext(*this, Decl); + LocalInstantiationScope Scope(*this); + + MultiLevelTemplateArgumentList MLTAL = + getTemplateInstantiationArgs(Decl, nullptr, /*RelativeToPrimary*/true); + + // If this is not an explicit specialization - we need to get the instantiated + // version of the template arguments and add them to scope for the + // substitution. 
+ if (Decl->isTemplateInstantiation()) { + InstantiatingTemplate Inst(*this, Decl->getPointOfInstantiation(), + InstantiatingTemplate::ConstraintsCheck{}, Decl->getPrimaryTemplate(), + MLTAL.getInnermost(), SourceRange()); + if (Inst.isInvalid()) + return true; + if (addInstantiatedParametersToScope(*this, Decl, + Decl->getTemplateInstantiationPattern(), + Scope, MLTAL)) + return true; + } + + return CheckConstraintSatisfaction(Template, TemplateAC, TemplateArgs, + PointOfInstantiation, Satisfaction); +} + /// Initializes the common fields of an instantiation function /// declaration (New) from the corresponding fields of its template (Tmpl). /// diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 975d6620c06f8..d947d6d282be0 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -937,6 +937,10 @@ bool Sema::containsUnexpandedParameterPacks(Declarator &D) { } } + if (Expr *TRC = D.getTrailingRequiresClause()) + if (TRC->containsUnexpandedParameterPack()) + return true; + return false; } diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index eb5646729899e..1d775dab67be8 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -11552,6 +11552,13 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { NewCallOpType); } + // Transform the trailing requires clause + ExprResult NewTrailingRequiresClause; + if (Expr *TRC = E->getCallOperator()->getTrailingRequiresClause()) + // FIXME: Concepts: Substitution into requires clause should only happen + // when checking satisfaction. + NewTrailingRequiresClause = getDerived().TransformExpr(TRC); + // Create the local class that will describe the lambda. CXXRecordDecl *OldClass = E->getLambdaClass(); CXXRecordDecl *Class @@ -11572,7 +11579,8 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { Class, E->getIntroducerRange(), NewCallOpTSI, E->getCallOperator()->getEndLoc(), NewCallOpTSI->getTypeLoc().castAs().getParams(), - E->getCallOperator()->getConstexprKind()); + E->getCallOperator()->getConstexprKind(), + NewTrailingRequiresClause.get()); LSI->CallOperator = NewCallOperator; diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp index 3351f76151e35..a132164d30e75 100644 --- a/clang/lib/Serialization/ASTReaderDecl.cpp +++ b/clang/lib/Serialization/ASTReaderDecl.cpp @@ -822,6 +822,7 @@ void ASTDeclReader::VisitDeclaratorDecl(DeclaratorDecl *DD) { if (Record.readInt()) { // hasExtInfo auto *Info = new (Reader.getContext()) DeclaratorDecl::ExtInfo(); Record.readQualifierInfo(*Info); + Info->TrailingRequiresClause = Record.readExpr(); DD->DeclInfo = Info; } QualType TSIType = Record.readType(); diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 66f4db855a3e9..a553936570b59 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -523,8 +523,11 @@ void ASTDeclWriter::VisitDeclaratorDecl(DeclaratorDecl *D) { VisitValueDecl(D); Record.AddSourceLocation(D->getInnerLocStart()); Record.push_back(D->hasExtInfo()); - if (D->hasExtInfo()) - Record.AddQualifierInfo(*D->getExtInfo()); + if (D->hasExtInfo()) { + DeclaratorDecl::ExtInfo *Info = D->getExtInfo(); + Record.AddQualifierInfo(*Info); + Record.AddStmt(Info->TrailingRequiresClause); + } // The location information is deferred until the end of the record. Record.AddTypeRef(D->getTypeSourceInfo() ? 
D->getTypeSourceInfo()->getType() : QualType()); diff --git a/clang/test/CXX/class.derived/class.virtual/p6.cpp b/clang/test/CXX/class.derived/class.virtual/p6.cpp new file mode 100644 index 0000000000000..63a4313de5541 --- /dev/null +++ b/clang/test/CXX/class.derived/class.virtual/p6.cpp @@ -0,0 +1,21 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +class A { + virtual void f1() requires (sizeof(T) == 0); + // expected-error@-1{{virtual function cannot have a requires clause}} + virtual void f2() requires (sizeof(T) == 1); + // expected-error@-1{{virtual function cannot have a requires clause}} +}; + +template +class B : A { + virtual void f1() requires (sizeof(T) == 0) override {} + // expected-error@-1{{virtual function cannot have a requires clause}} +}; + +template struct C : T {void f() requires true; }; +// expected-error@-1{{virtual function cannot have a requires clause}} +struct D { virtual void f(); }; +template struct C; +// expected-note@-1{{in instantiation of template class 'C' requested here}} \ No newline at end of file diff --git a/clang/test/CXX/dcl/dcl.decl/p3.cpp b/clang/test/CXX/dcl/dcl.decl/p3.cpp new file mode 100644 index 0000000000000..eec0aa2043a2e --- /dev/null +++ b/clang/test/CXX/dcl/dcl.decl/p3.cpp @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr bool is_same_v = false; + +template +constexpr bool is_same_v = true; + +void f1(int a) requires true; // OK +auto f2(int a) -> bool requires true; // OK +auto f3(int a) -> bool (*)(int b) requires true; // OK +auto f4(int a) requires true -> bool; // expected-error{{trailing return type must appear before trailing requires clause}} +int f5(int a) requires; // expected-error{{expected expression}} +int f6(int a) requires {} // expected-error{{expected expression}} +void (f7()) requires true; +void (f8() requires true); // expected-error{{trailing requires clause should be placed outside parentheses}} +void (*(f9 requires (true)))(); // expected-error{{trailing requires clause should be placed outside parentheses}} +static_assert(is_same_v); +void (*pf)() requires true; // expected-error{{trailing requires clause can only be used when declaring a function}} +void g1(int (*dsdads)() requires false); // expected-error{{trailing requires clause can only be used when declaring a function}} +void g2(int (*(*dsdads)())() requires true); // expected-error{{trailing requires clause can only be used when declaring a function}} +void g3(int (*(*dsdads)(int) requires true)() ); // expected-error{{trailing requires clause should be placed outside parentheses}} +using T = void (); +T x requires true; +struct S { + T m1 requires true, m2 requires true; +}; + +template +struct R { + R(T t); +}; + +template +R(T) -> R requires true; // expected-error{{deduction guide cannot have a requires clause}} diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp new file mode 100644 index 0000000000000..fafb3f7b35d9f --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/mixed-constraints.cpp @@ -0,0 +1,18 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{because 'sizeof(char [20]) <= 10' (20 <= 10) evaluated to false}} +// expected-note@-2{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} +void foo() requires (sizeof(T) <= 8) {} +// expected-note@-1{{candidate template ignored: 
constraints not satisfied [with T = char]}} +// expected-note@-2{{candidate template ignored: constraints not satisfied [with T = char [9]]}} +// expected-note@-3{{candidate template ignored: constraints not satisfied [with T = char [20]]}} +// expected-note@-4{{because 'sizeof(char [9]) <= 8' (9 <= 8) evaluated to false}} + +void bar() { + foo(); // expected-error{{no matching function for call to 'foo'}} + foo(); + foo(); + foo(); // expected-error{{no matching function for call to 'foo'}} + foo(); // expected-error{{no matching function for call to 'foo'}} +} \ No newline at end of file diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp new file mode 100644 index 0000000000000..f13ab279da33a --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.id/p4.cpp @@ -0,0 +1,58 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +namespace functions +{ + void foo(int) requires false {} + // expected-note@-1 3{{because 'false' evaluated to false}} + // expected-note@-2 {{candidate function not viable: constraints not satisfied}} + void bar(int) requires true {} + + void a(int); + void a(double); + + void baz() { + foo(1); // expected-error{{no matching function for call to 'foo'}} + bar(1); + void (*p1)(int) = foo; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + void (*p3)(int) = bar; + decltype(foo)* a1 = nullptr; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + decltype(bar)* a2 = nullptr; + } +} + +namespace methods +{ + template + struct A { + static void foo(int) requires (sizeof(T) == 1) {} // expected-note 3{{because 'sizeof(char [2]) == 1' (2 == 1) evaluated to false}} + static void bar(int) requires (sizeof(T) == 2) {} // expected-note 3{{because 'sizeof(char) == 2' (1 == 2) evaluated to false}} + }; + + void baz() { + A::foo(1); + A::bar(1); // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + A::foo(1); // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + A::bar(1); + void (*p1)(int) = A::foo; + void (*p2)(int) = A::bar; // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + void (*p3)(int) = A::foo; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + void (*p4)(int) = A::bar; + decltype(A::foo)* a1 = nullptr; + decltype(A::bar)* a2 = nullptr; // expected-error{{invalid reference to function 'bar': constraints not satisfied}} + decltype(A::foo)* a3 = nullptr; // expected-error{{invalid reference to function 'foo': constraints not satisfied}} + decltype(A::bar)* a4 = nullptr; + } +} + +namespace operators +{ + template + struct A { + A operator-(A b) requires (sizeof(T) == 1) { return b; } // expected-note{{because 'sizeof(int) == 1' (4 == 1) evaluated to false}} + }; + + void baz() { + auto* x = &A::operator-; // expected-error{{invalid reference to function 'operator-': constraints not satisfied}} + auto y = &A::operator-; + } +} \ No newline at end of file diff --git a/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp new file mode 100644 index 0000000000000..942280e1059fb --- /dev/null +++ b/clang/test/CXX/expr/expr.prim/expr.prim.lambda/expr.prim.lambda.closure/p3.cpp @@ -0,0 +1,20 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +auto l1 = [] (auto x) requires (sizeof(decltype(x)) 
== 1) { return x; }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with $0 = int]}} +// expected-note@-2{{because 'sizeof(decltype(x)) == 1' (4 == 1) evaluated to false}} + +auto l1t1 = l1('a'); +auto l1t2 = l1(1); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} + +auto l2 = [] (auto... x) requires ((sizeof(decltype(x)) >= 2) && ...) { return (x + ...); }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with $0 = ]}} +// expected-note@-2{{candidate template ignored: constraints not satisfied [with $0 = ]}} +// expected-note@-3 2{{because 'sizeof(decltype(x)) >= 2' (1 >= 2) evaluated to false}} + +auto l2t1 = l2('a'); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} +auto l2t2 = l2(1, 'a'); +// expected-error@-1{{no matching function for call to object of type '(lambda at}} +auto l2t3 = l2((short)1, (short)1); \ No newline at end of file diff --git a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp new file mode 100644 index 0000000000000..dba2ef204e1d6 --- /dev/null +++ b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp @@ -0,0 +1,114 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr static bool is_same_v = false; + +template +constexpr static bool is_same_v = true; + +namespace templates +{ + template + concept AtLeast1 = sizeof(T) >= 1; + + template + int foo(T t) requires (sizeof(T) == 4) { // expected-note {{candidate function}} + return 0; + } + + template + char foo(T t) requires AtLeast1 { // expected-note {{candidate function}} + return 'a'; + } + + template + double foo(T t) requires (AtLeast1 && sizeof(T) <= 2) { + return 'a'; + } + + static_assert(is_same_v); // expected-error {{call to 'foo' is ambiguous}} + static_assert(is_same_v); + + template + void bar() requires (sizeof(T) == 1) { } + // expected-note@-1{{similar constraint expressions not considered equivalent}} + // expected-note@-2{{candidate function [with T = char]}} + + template + void bar() requires (sizeof(T) == 1 && sizeof(T) >= 0) { } + // expected-note@-1{{candidate function [with T = char]}} + // expected-note@-2{{similar constraint expression here}} + + static_assert(is_same_v()), void>); + // expected-error@-1{{call to 'bar' is ambiguous}} + + template + constexpr int baz() requires AtLeast1 { // expected-note {{candidate function}} + return 1; + } + + template requires AtLeast1 + constexpr int baz() { // expected-note {{candidate function [with T = int]}} + return 2; + } + + static_assert(baz() == 1); // expected-error {{call to 'baz' is ambiguous}} +} + +namespace non_template +{ + template + concept AtLeast2 = sizeof(T) >= 2; + + template + concept AtMost8 = sizeof(T) <= 8; + + int foo() requires AtLeast2 && AtMost8 { + return 0; + } + + double foo() requires AtLeast2 { + return 0.0; + } + + double baz() requires AtLeast2 && AtMost8 { // expected-note {{candidate function}} + return 0.0; + } + + int baz() requires AtMost8 && AtLeast2 { // expected-note {{candidate function}} + return 0.0; + } + + void bar() requires (sizeof(long) >= 8) { } + // expected-note@-1 {{candidate function}} + // expected-note@-2 {{similar constraint expressions not considered equivalent}} + + void bar() requires (sizeof(long) >= 8 && sizeof(int) <= 30) { } + // expected-note@-1 {{candidate function}} + // expected-note@-2 {{similar constraint expression here}} + + static_assert(is_same_v); + 
static_assert(is_same_v); // expected-error {{call to 'baz' is ambiguous}} + static_assert(is_same_v); // expected-error {{call to 'bar' is ambiguous}} + + constexpr int goo(int a) requires AtLeast2 && true { + return 1; + } + + constexpr int goo(const int b) requires AtLeast2 { + return 2; + } + + // Only trailing requires clauses of redeclarations are compared for overload resolution. + constexpr int doo(int a, ...) requires AtLeast2 && true { // expected-note {{candidate function}} + return 1; + } + + constexpr int doo(int b) requires AtLeast2 { // expected-note {{candidate function}} + return 2; + } + + static_assert(goo(1) == 1); + static_assert(doo(2) == 1); // expected-error {{call to 'doo' is ambiguous}} +} + diff --git a/clang/test/CXX/over/over.match/over.match.viable/p3.cpp b/clang/test/CXX/over/over.match/over.match.viable/p3.cpp new file mode 100644 index 0000000000000..ef752d76ec23d --- /dev/null +++ b/clang/test/CXX/over/over.match/over.match.viable/p3.cpp @@ -0,0 +1,63 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +struct S2 {}; +// expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'S1' to 'const S2' for 1st argument}} +// expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'S1' to 'S2' for 1st argument}} +// expected-note@-3 {{candidate constructor (the implicit default constructor) not viable: requires 0 arguments, but 1 was provided}} + +struct S1 { + void foo() const requires true {} + void foo() const requires false {} + void bar() const requires false {} + // expected-note@-1 {{because 'false' evaluated to false}} + operator bool() const requires true { return true; } + explicit operator bool() const requires false; + explicit operator S2() const requires false; + // expected-note@-1 {{candidate function not viable: constraints not satisfied}} + // expected-note@-2 {{because 'false' evaluated to false}} +}; + +void foo() { + S1().foo(); + S1().bar(); + // expected-error@-1 {{invalid reference to function 'bar': constraints not satisfied}} + (void) static_cast(S1()); + (void) static_cast(S1()); + // expected-error@-1 {{no matching conversion for static_cast from 'S1' to 'S2'}} +} + +// Test that constraints are checked before implicit conversions are formed. + +template +struct invalid_template { using X = typename T::non_existant; }; +struct A { + template::aadasas> + operator T() {} +}; + +void foo(int) requires false; +void foo(A) requires true; + +struct S { + void foo(int) requires false; + void foo(A) requires true; + S(A) requires false; + S(double) requires true; + ~S() requires false; + // expected-note@-1 2{{because 'false' evaluated to false}} + ~S() requires true; + operator int() requires true; + operator int() requires false; +}; + +void bar() { + foo(A{}); + S{1.}.foo(A{}); + // expected-error@-1{{invalid reference to function '~S': constraints not satisfied}} + // Note - this behavior w.r.t. constrained dtors is a consequence of current + // wording, which does not invoke overload resolution when a dtor is called. + // P0848 is set to address this issue. 
+ S s = 1; + // expected-error@-1{{invalid reference to function '~S': constraints not satisfied}} + int a = s; +} \ No newline at end of file diff --git a/clang/test/CXX/over/over.over/p4-2a.cpp b/clang/test/CXX/over/over.over/p4-2a.cpp new file mode 100644 index 0000000000000..a5d7a110992cf --- /dev/null +++ b/clang/test/CXX/over/over.over/p4-2a.cpp @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -verify %s + +template +constexpr static bool is_same_v = false; + +template +constexpr static bool is_same_v = true; + +template +concept AtLeast2 = sizeof(T) >= 2; + +template +concept AtMost8 = sizeof(T) <= 8; + +int foo() requires AtLeast2 && AtMost8 { + return 0; +} + +double foo() requires AtLeast2 { + return 0.0; +} + +char bar() requires AtLeast2 { // expected-note {{possible target for call}} + return 1.0; +} + +short bar() requires AtLeast2 && AtMost8 { +// expected-note@-1{{possible target for call}} +// expected-note@-2{{candidate function}} + return 0.0; +} + +int bar() requires AtMost8 && AtLeast2 { +// expected-note@-1{{possible target for call}} +// expected-note@-2{{candidate function}} + return 0.0; +} + +char baz() requires AtLeast2 { + return 1.0; +} + +short baz() requires AtLeast2 && AtMost8 { + return 0.0; +} + +int baz() requires AtMost8 && AtLeast2 { + return 0.0; +} + +long baz() requires AtMost8 && AtLeast2 && AtLeast2 { + return 3.0; +} + +void a() { + static_assert(is_same_v); + static_assert(is_same_v); + // expected-error@-1{{reference to overloaded function could not be resolved; did you mean to call it with no arguments?}} + // expected-error@-2{{call to 'bar' is ambiguous}} + static_assert(is_same_v); +} \ No newline at end of file diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp index c1a3a27fbeacc..99de7261a81c8 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.constr/function-templates.cpp @@ -23,14 +23,13 @@ static_assert(is_same_v(nullptr)), int>); static_assert(is_same_v); // expected-error {{no matching function for call to 'dereference'}} static_assert(is_same_v('a')), char>); // expected-error {{no matching function for call to 'dereference'}} - -template requires T{} + T{} // expected-note {{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} +template requires (T{} + T{}) // expected-note {{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} auto foo(T t) { // expected-note {{candidate template ignored: constraints not satisfied [with T = A]}} return t + t; } -template requires !((T{} - T{}) && (T{} + T{})) || false +template requires (!((T{} - T{}) && (T{} + T{})) || false) // expected-note@-1{{because substituted constraint expression is ill-formed: invalid operands to binary expression ('A' and 'A')}} // expected-note@-2{{and 'false' evaluated to false}} auto bar(T t) { // expected-note {{candidate template ignored: constraints not satisfied [with T = A]}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp index 24caa5063a1b4..a25b22a9a1544 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp +++ 
b/clang/test/CXX/temp/temp.constr/temp.constr.constr/non-function-templates.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 2 // expected-note{{because 'sizeof(char) >= 2' (1 >= 2) evaluated to false}} +template requires (sizeof(T) >= 2) // expected-note{{because 'sizeof(char) >= 2' (1 >= 2) evaluated to false}} struct A { static constexpr int value = sizeof(T); }; @@ -9,8 +9,8 @@ static_assert(A::value == 4); static_assert(A::value == 1); // expected-error{{constraints not satisfied for class template 'A' [with T = char]}} template - requires sizeof(T) != sizeof(U) // expected-note{{because 'sizeof(int) != sizeof(char [4])' (4 != 4) evaluated to false}} - && sizeof(T) >= 4 // expected-note{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} + requires (sizeof(T) != sizeof(U) // expected-note{{because 'sizeof(int) != sizeof(char [4])' (4 != 4) evaluated to false}} + && sizeof(T) >= 4) // expected-note{{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} constexpr int SizeDiff = sizeof(T) > sizeof(U) ? sizeof(T) - sizeof(U) : sizeof(U) - sizeof(T); static_assert(SizeDiff == 3); @@ -44,16 +44,16 @@ static_assert(S::value); template struct AA { - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} struct B { static constexpr int a = 0; }; - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} static constexpr int b = 1; - template requires sizeof(U) == sizeof(T) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} + template requires (sizeof(U) == sizeof(T)) // expected-note{{because 'sizeof(int [2]) == sizeof(int)' (8 == 4) evaluated to false}} static constexpr int getB() { // expected-note{{candidate template ignored: constraints not satisfied [with U = int [2]]}} return 2; } @@ -85,8 +85,8 @@ template requires B::type // expected-note{{in instantiation of t // expected-note@-1{{while substituting template arguments into constraint expression here}} struct C { }; -template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} +template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} struct D { }; static_assert(C{}); // expected-note{{while checking constraint satisfaction for template 'C' required here}} -static_assert(D{}); // expected-note{{while checking constraint satisfaction for template 'D' required here}} \ No newline at end of file +static_assert(D{}); // expected-note{{while checking constraint satisfaction for template 'D' required here}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp index 47bd2a5507690..1ea4da29ee9f5 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.constr/partial-specializations.cpp @@ -2,10 +2,10 @@ namespace class_templates { - template requires sizeof(T) >= 4 // expected-note {{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} + 
template requires (sizeof(T) >= 4) // expected-note {{because 'sizeof(char) >= 4' (1 >= 4) evaluated to false}} struct is_same { static constexpr bool value = false; }; - template requires sizeof(T*) >= 4 && sizeof(T) >= 4 + template requires (sizeof(T*) >= 4 && sizeof(T) >= 4) struct is_same { static constexpr bool value = true; }; static_assert(!is_same::value); @@ -23,7 +23,7 @@ namespace class_templates // expected-note@-1{{while substituting template arguments into constraint expression here}} struct B {}; - template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} + template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} struct B {}; static_assert((B{}, true)); // expected-note{{while checking constraint satisfaction for class template partial specialization 'B' required here}} @@ -35,10 +35,10 @@ namespace class_templates namespace variable_templates { - template requires sizeof(T) >= 4 + template requires (sizeof(T) >= 4) constexpr bool is_same_v = false; - template requires sizeof(T*) >= 4 && sizeof(T) >= 4 + template requires (sizeof(T*) >= 4 && sizeof(T) >= 4) constexpr bool is_same_v = true; static_assert(!is_same_v); @@ -55,7 +55,7 @@ namespace variable_templates // expected-note@-1{{while substituting template arguments into constraint expression here}} constexpr bool v1 = true; - template requires T{} // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} + template requires (T{}) // expected-error{{atomic constraint must be of type 'bool' (found 'int')}} constexpr bool v1 = true; static_assert(v1); // expected-note{{while checking constraint satisfaction for variable template partial specialization 'v1' required here}} diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp index 5d5361f9c20c3..6f7b80e26a66a 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/class-template-decl.cpp @@ -2,9 +2,9 @@ namespace nodiag { -template requires bool(T()) +template requires (bool(T())) struct A; -template requires bool(U()) +template requires (bool(U())) struct A; } // end namespace nodiag @@ -21,7 +21,7 @@ struct B; template requires true // expected-note{{previous template declaration is here}} struct C; -template requires !0 // expected-error{{requires clause differs in template redeclaration}} +template requires (!0) // expected-error{{requires clause differs in template redeclaration}} struct C; } // end namespace diag @@ -29,15 +29,15 @@ struct C; namespace nodiag { struct AA { - template requires someFunc(T()) + template requires (someFunc(T())) struct A; }; -template requires someFunc(U()) +template requires (someFunc(U())) struct AA::A { }; struct AAF { - template requires someFunc(T()) + template requires (someFunc(T())) friend struct AA::A; }; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp index c83ab26059d7c..30fbec64eea78 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/func-template-decl.cpp @@ -2,9 +2,9 @@ namespace nodiag { -template requires bool(T()) +template requires (bool(T())) int A(); -template requires bool(U()) +template requires (bool(U())) int A(); } // end 
namespace nodiag @@ -26,7 +26,7 @@ int orig::A(); template requires true int orig::B(); // expected-error@-1{{out-of-line declaration of 'B' does not match any declaration in namespace 'diag::orig'}} -template requires !0 +template requires (!0) int orig::C(); // expected-error@-1{{out-of-line declaration of 'C' does not match any declaration in namespace 'diag::orig'}} @@ -35,11 +35,11 @@ int orig::C(); namespace nodiag { struct AA { - template requires someFunc(T()) + template requires (someFunc(T())) int A(); }; -template requires someFunc(T()) +template requires (someFunc(T())) int AA::A() { return sizeof(T); } } // end namespace nodiag diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp index cf6874f12d3f5..eabb636b0bbbf 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/var-template-decl.cpp @@ -3,11 +3,11 @@ namespace nodiag { struct B { - template requires bool(T()) + template requires (bool(T())) static int A; }; -template requires bool(U()) +template requires (bool(U())) int B::A = int(U()); } // end namespace nodiag @@ -15,11 +15,11 @@ int B::A = int(U()); namespace diag { struct B { - template requires bool(T()) // expected-note{{previous template declaration is here}} + template requires (bool(T())) // expected-note{{previous template declaration is here}} static int A; }; -template requires !bool(U()) // expected-error{{requires clause differs in template redeclaration}} +template requires (!bool(U())) // expected-error{{requires clause differs in template redeclaration}} int B::A = int(U()); } // end namespace diag \ No newline at end of file diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp index 8c2f552694173..5d41035aa88d7 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/class-template-partial-specializations.cpp @@ -1,9 +1,12 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalen}} class A{}; // expected-note{{template is declared here}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} + class A{}; // expected-error{{class template partial specialization is not more specialized than the primary template}} template @@ -12,7 +15,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 class B{}; -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) class B{}; template @@ -48,3 +51,15 @@ struct F{ enum{ value = 3 }; }; static_assert(F::value == 2); static_assert(F::value == 3); static_assert(F::value == 1); + +// Make sure atomic constraints subsume each other only if their parameter +// mappings are identical. 
+ +template requires C2 +struct I { }; // expected-note {{template is declared here}} + +template requires C2 +struct I { }; // expected-error {{class template partial specialization is not more specialized than the primary template}} + +template requires C2 && C2 +struct I { }; diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp index cc578fe0ad62c..7f68369d52842 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/function-templates.cpp @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalent}} bool a() { return false; } // expected-note {{candidate function [with T = unsigned int]}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} bool a() { return true; } // expected-note {{candidate function [with T = unsigned int]}} bool av = a(); // expected-error {{call to 'a' is ambiguous}} @@ -14,7 +16,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 constexpr bool b() { return false; } -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) constexpr bool b() { return true; } static_assert(b()); @@ -86,4 +88,4 @@ static_assert(sizeof(g())); template struct X {}; template int h(X<0>); template int h(X); -static_assert(sizeof(h(X<0>{}))); \ No newline at end of file +static_assert(sizeof(h(X<0>{}))); diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp index b40c77e70a194..cf88e34036dc7 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.order/var-template-partial-specializations.cpp @@ -1,9 +1,11 @@ // RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s -template requires sizeof(T) >= 4 +template requires (sizeof(T) >= 4) +// expected-note@-1{{similar constraint expressions not considered equivalent}} bool a = false; // expected-note{{template is declared here}} -template requires sizeof(T) >= 4 && sizeof(T) <= 10 +template requires (sizeof(T) >= 4 && sizeof(T) <= 10) +// expected-note@-1{{similar constraint expression here}} bool a = true; // expected-error{{variable template partial specialization is not more specialized than the primary template}} template @@ -12,7 +14,7 @@ concept C1 = sizeof(T) >= 4; template requires C1 bool b = false; -template requires C1 && sizeof(T) <= 10 +template requires (C1 && sizeof(T) <= 10) bool b = true; template diff --git a/clang/test/CXX/temp/temp.explicit/p8.cpp b/clang/test/CXX/temp/temp.explicit/p8.cpp new file mode 100644 index 0000000000000..72d2255789960 --- /dev/null +++ b/clang/test/CXX/temp/temp.explicit/p8.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ -verify %s + +template requires (sizeof(T) + sizeof(S) < 10) +// expected-note@-1{{because 'sizeof(char [100]) + sizeof(char) < 10' (101 < 10) evaluated to false}} +void f(T t, S s) requires (sizeof(t) == 1 && sizeof(s) == 1) { }; +// expected-note@-1{{candidate template ignored: constraints not satisfied [with T = 
int, S = char]}} +// expected-note@-2{{because 'sizeof (t) == 1' (4 == 1) evaluated to false}} +// expected-note@-3{{candidate template ignored: constraints not satisfied [with T = char, S = short]}} +// expected-note@-4{{because 'sizeof (s) == 1' (2 == 1) evaluated to false}} +// expected-note@-5{{candidate template ignored: constraints not satisfied [with T = char [100], S = char]}} + +template<> +void f(int t, char s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} + +template<> +void f(char t, short s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} + +template<> +void f(char t[100], char s) { }; +// expected-error@-1{{no function template matches function template specialization 'f'}} \ No newline at end of file diff --git a/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp b/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp index 12ab338a6b00a..1cd2605ce0556 100644 --- a/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp +++ b/clang/test/Parser/cxx-concepts-ambig-constraint-expr.cpp @@ -5,25 +5,5 @@ // the syntax is consumed without backtracking. // type-specifier-seq in conversion-type-id -template requires (bool)&T::operator short -unsigned int foo(); // expected-error {{C++ requires a type specifier for all declarations}} - -// type-specifier-seq in new-type-id -template requires (bool)sizeof new (T::f()) short -unsigned int bar(); // expected-error {{C++ requires a type specifier for all declarations}} - -template requires (bool)sizeof new (T::f()) unsigned // expected-error {{'struct' cannot be signed or unsigned}} -struct X { }; // expected-error {{'X' cannot be defined in a type specifier}} - -// C-style cast -// of function call on function-style cast -template requires (bool(T())) -T (*fp)(); // expected-error {{use of undeclared identifier 'fp'}} - -// function-style cast -// as the callee in a function call -struct A { - static int t; - template requires bool(T()) - (A(T (&t))) { } // expected-error {{called object type 'bool' is not a function or function pointer}} -}; +template requires T::operator short +unsigned int foo(); // expected-error {{C++ requires a type specifier for all declarations}} \ No newline at end of file diff --git a/clang/test/Parser/cxx-concepts-requires-clause.cpp b/clang/test/Parser/cxx-concepts-requires-clause.cpp index 01893a94cbc94..60e7004e08187 100644 --- a/clang/test/Parser/cxx-concepts-requires-clause.cpp +++ b/clang/test/Parser/cxx-concepts-requires-clause.cpp @@ -1,13 +1,11 @@ -// RUN: %clang_cc1 -std=c++14 -fconcepts-ts -x c++ %s -verify -// expected-no-diagnostics +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ %s -verify // Test parsing of the optional requires-clause in a template-declaration. 
template requires true void foo() { } - -template requires !0 +template requires (!0) struct A { void foo(); struct AA; @@ -27,31 +25,30 @@ struct A { using MQ = M; }; -template requires !0 +template requires (!0) void A::foo() { } -template requires !0 +template requires (!0) struct A::AA { }; -template requires !0 +template requires (!0) enum A::E : int { E0 }; -template requires !0 +template requires (!0) int A::x = 0; -template requires !0 +template requires (!0) template requires true void A::Mfoo() { } -template requires !0 +template requires (!0) template requires true struct A::M { }; -template requires !0 +template requires (!0) template requires true int A::Mx = 0; - template requires true int x = 0; @@ -80,3 +77,81 @@ struct C::M { }; template requires true int C::Mx = 0; + +// Test behavior with non-primary-expression requires clauses + +template requires foo() +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B1 { }; + +int func() { } + +template requires func() +// expected-error@-1{{atomic constraint must be of type 'bool' (found '')}} +// expected-note@-2{{parentheses are required around this expression in a requires clause}} +struct B2 { }; + +template requires (foo()) +struct B3 { }; + +template requires T{} +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B4 { }; + +template requires sizeof(T) == 0 +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B5 { }; + +template requires (sizeof(T)) == 0 +// expected-error@-1{{parentheses are required around this expression in a requires clause}} +struct B6 { }; + +template requires 0 +// expected-error@-1{{atomic constraint must be of type 'bool' (found 'int')}} +(int) bar() { }; + +template requires foo +(int) bar() { }; +// expected-error@-1{{expected '(' for function-style cast or type construction}} + +template +void bar() requires foo(); +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires (foo()); + +template +void bar() requires func(); +// expected-error@-1{{atomic constraint must be of type 'bool' (found '')}} +// expected-note@-2{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires T{}; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires sizeof(T) == 0; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +template +void bar() requires (sizeof(T)) == 0; +// expected-error@-1{{parentheses are required around this expression in a requires clause}} + +void bar(int x, int y) requires (x, y, true); + +struct B { + int x; + void foo(int y) requires (x, this, this->x, y, true); + static void bar(int y) requires (x, true); + // expected-error@-1{{'this' cannot be implicitly used in a static member function declaration}} + static void baz(int y) requires (this, true); + // expected-error@-1{{'this' cannot be used in a static member function declaration}} +}; + +auto lambda1 = [] (auto x) requires (sizeof(decltype(x)) == 1) { }; + +auto lambda2 = [] (auto x) constexpr -> int requires (sizeof(decltype(x)) == 1) { return 0; }; + +auto lambda3 = [] requires (sizeof(char) == 1) { }; +// expected-error@-1{{lambda requires '()' before 'requires' clause}} \ No newline at end of file diff --git 
a/clang/test/SemaTemplate/instantiate-requires-clause.cpp b/clang/test/SemaTemplate/instantiate-requires-clause.cpp new file mode 100644 index 0000000000000..f36396b98db73 --- /dev/null +++ b/clang/test/SemaTemplate/instantiate-requires-clause.cpp @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -std=c++2a -fconcepts-ts -x c++ %s -verify + +template requires ((sizeof(Args) == 1), ...) +// expected-note@-1 {{because '(sizeof(int) == 1) , (sizeof(char) == 1) , (sizeof(int) == 1)' evaluated to false}} +void f1(Args&&... args) { } +// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = ]}} + +using f11 = decltype(f1('a')); +using f12 = decltype(f1(1, 'b')); +using f13 = decltype(f1(1, 'b', 2)); +// expected-error@-1 {{no matching function for call to 'f1'}} + +template +void f2(Args&&... args) requires ((sizeof(args) == 1), ...) { } +// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = ]}} +// expected-note@-2 {{because '(sizeof (args) == 1) , (sizeof (args) == 1) , (sizeof (args) == 1)' evaluated to false}} + +using f21 = decltype(f2('a')); +using f22 = decltype(f2(1, 'b')); +using f23 = decltype(f2(1, 'b', 2)); +// expected-error@-1 {{no matching function for call to 'f2'}} + +template requires ((sizeof(Args) == 1), ...) +// expected-note@-1 {{because '(sizeof(int) == 1) , (sizeof(char) == 1) , (sizeof(int) == 1)' evaluated to false}} +void f3(Args&&... args) requires ((sizeof(args) == 1), ...) { } +// expected-note@-1 {{candidate template ignored: constraints not satisfied [with Args = ]}} + +using f31 = decltype(f3('a')); +using f32 = decltype(f3(1, 'b')); +using f33 = decltype(f3(1, 'b', 2)); +// expected-error@-1 {{no matching function for call to 'f3'}} From 4a83f1e171b73b819bcdb486e363b409d25fc629 Mon Sep 17 00:00:00 2001 From: Hiroshi Inoue Date: Thu, 9 Jan 2020 13:03:23 +0000 Subject: [PATCH 52/89] [mlir] fix test failure in EDSC/builder-api-test This patch fixes a test failure on a non-intel (PowerPC64) box. The two affine.load are independent and hence llvm may reorder them. The CHECK lines are modified for supporting reordered case. Differential Revision: https://reviews.llvm.org/D72435 --- mlir/test/EDSC/builder-api-test.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mlir/test/EDSC/builder-api-test.cpp b/mlir/test/EDSC/builder-api-test.cpp index c776ffe12bdd7..64e1b7094df3c 100644 --- a/mlir/test/EDSC/builder-api-test.cpp +++ b/mlir/test/EDSC/builder-api-test.cpp @@ -724,9 +724,10 @@ TEST_FUNC(indirect_access) { // clang-format on // clang-format off - // CHECK-LABEL: func @indirect_access( - // CHECK: [[B:%.*]] = affine.load - // CHECK: [[D:%.*]] = affine.load + // CHECK-LABEL: func @indirect_access + // CHECK-SAME: (%[[ARG0:.*]]: memref, %[[ARG1:.*]]: memref, %[[ARG2:.*]]: memref, %[[ARG3:.*]]: memref) + // CHECK-DAG: [[B:%.*]] = affine.load %[[ARG1]] + // CHECK-DAG: [[D:%.*]] = affine.load %[[ARG3]] // CHECK: load %{{.*}}{{\[}}[[B]]{{\]}} // CHECK: store %{{.*}}, %{{.*}}{{\[}}[[D]]{{\]}} // clang-format on From 5c4661b7784115cb330996b3a6461c5927339aef Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 9 Jan 2020 14:14:54 +0100 Subject: [PATCH 53/89] [lldb] Modernize OptionValue::SetValueChangedCallback instead of a function pointer + void*, take a std::function. This removes a bunch of repetitive, unsafe void* casts. 
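For contrast, here is a minimal usage sketch of the old and new registration styles (hypothetical MyProperties/Update names, echoing the ProcessProperties hunk below rather than quoting it):

  // Before: function pointer plus untyped baton, cast back inside the callback.
  static void ValueChangedCallback(void *baton, OptionValue *) {
    static_cast<MyProperties *>(baton)->Update();
  }
  m_value_sp->SetValueChangedCallback(ValueChangedCallback, this);

  // After: a std::function<void()>; a capturing lambda keeps the type
  // information, so no void* cast is needed.
  m_value_sp->SetValueChangedCallback([this] { Update(); });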
--- lldb/include/lldb/Interpreter/OptionValue.h | 16 +-- .../lldb/Interpreter/OptionValueProperties.h | 3 +- lldb/include/lldb/Interpreter/Property.h | 3 +- lldb/include/lldb/Target/Process.h | 3 - lldb/include/lldb/Target/Target.h | 29 ++-- lldb/include/lldb/lldb-private-interfaces.h | 2 - .../Interpreter/OptionValueProperties.cpp | 4 +- lldb/source/Interpreter/Property.cpp | 5 +- lldb/source/Target/Process.cpp | 9 +- lldb/source/Target/Target.cpp | 128 +++++++----------- 10 files changed, 70 insertions(+), 132 deletions(-) diff --git a/lldb/include/lldb/Interpreter/OptionValue.h b/lldb/include/lldb/Interpreter/OptionValue.h index 734c92b4bcada..44c7f621a5824 100644 --- a/lldb/include/lldb/Interpreter/OptionValue.h +++ b/lldb/include/lldb/Interpreter/OptionValue.h @@ -58,8 +58,7 @@ class OptionValue { eDumpGroupExport = (eDumpOptionCommand | eDumpOptionName | eDumpOptionValue) }; - OptionValue() - : m_callback(nullptr), m_baton(nullptr), m_value_was_set(false) {} + OptionValue() : m_value_was_set(false) {} virtual ~OptionValue() = default; @@ -304,22 +303,19 @@ class OptionValue { m_parent_wp = parent_sp; } - void SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton) { - assert(m_callback == nullptr); - m_callback = callback; - m_baton = baton; + void SetValueChangedCallback(std::function callback) { + assert(!m_callback); + m_callback = std::move(callback); } void NotifyValueChanged() { if (m_callback) - m_callback(m_baton, this); + m_callback(); } protected: lldb::OptionValueWP m_parent_wp; - OptionValueChangedCallback m_callback; - void *m_baton; + std::function m_callback; bool m_value_was_set; // This can be used to see if a value has been set // by a call to SetValueFromCString(). It is often // handy to know if an option value was set from the diff --git a/lldb/include/lldb/Interpreter/OptionValueProperties.h b/lldb/include/lldb/Interpreter/OptionValueProperties.h index bea2b3c91e009..980f01183ef56 100644 --- a/lldb/include/lldb/Interpreter/OptionValueProperties.h +++ b/lldb/include/lldb/Interpreter/OptionValueProperties.h @@ -198,8 +198,7 @@ class OptionValueProperties ConstString name); void SetValueChangedCallback(uint32_t property_idx, - OptionValueChangedCallback callback, - void *baton); + std::function callback); protected: Property *ProtectedGetPropertyAtIndex(uint32_t idx) { diff --git a/lldb/include/lldb/Interpreter/Property.h b/lldb/include/lldb/Interpreter/Property.h index 797aee4be8159..76264832705ba 100644 --- a/lldb/include/lldb/Interpreter/Property.h +++ b/lldb/include/lldb/Interpreter/Property.h @@ -64,8 +64,7 @@ class Property { uint32_t output_width, bool display_qualified_name) const; - void SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton); + void SetValueChangedCallback(std::function callback); protected: ConstString m_name; diff --git a/lldb/include/lldb/Target/Process.h b/lldb/include/lldb/Target/Process.h index 47c5c78704052..2ba996d4995f5 100644 --- a/lldb/include/lldb/Target/Process.h +++ b/lldb/include/lldb/Target/Process.h @@ -85,9 +85,6 @@ class ProcessProperties : public Properties { std::chrono::seconds GetUtilityExpressionTimeout() const; protected: - static void OptionValueChangedCallback(void *baton, - OptionValue *option_value); - Process *m_process; // Can be nullptr for global ProcessProperties }; diff --git a/lldb/include/lldb/Target/Target.h b/lldb/include/lldb/Target/Target.h index 6f8d60731acf5..1e9153c401ef1 100644 --- a/lldb/include/lldb/Target/Target.h +++ 
b/lldb/include/lldb/Target/Target.h @@ -209,26 +209,15 @@ class TargetProperties : public Properties { private: // Callbacks for m_launch_info. - static void Arg0ValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void RunArgsValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void EnvVarsValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void InheritEnvValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void InputPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void OutputPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void ErrorPathValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DetachOnErrorValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DisableASLRValueChangedCallback(void *target_property_ptr, - OptionValue *); - static void DisableSTDIOValueChangedCallback(void *target_property_ptr, - OptionValue *); + void Arg0ValueChangedCallback(); + void RunArgsValueChangedCallback(); + void EnvVarsValueChangedCallback(); + void InputPathValueChangedCallback(); + void OutputPathValueChangedCallback(); + void ErrorPathValueChangedCallback(); + void DetachOnErrorValueChangedCallback(); + void DisableASLRValueChangedCallback(); + void DisableSTDIOValueChangedCallback(); // Member variables. ProcessLaunchInfo m_launch_info; diff --git a/lldb/include/lldb/lldb-private-interfaces.h b/lldb/include/lldb/lldb-private-interfaces.h index 04b78bcc19f8e..27a2c4c3f27ff 100644 --- a/lldb/include/lldb/lldb-private-interfaces.h +++ b/lldb/include/lldb/lldb-private-interfaces.h @@ -82,8 +82,6 @@ typedef bool (*BreakpointHitCallback)(void *baton, typedef bool (*WatchpointHitCallback)(void *baton, StoppointCallbackContext *context, lldb::user_id_t watch_id); -typedef void (*OptionValueChangedCallback)(void *baton, - OptionValue *option_value); typedef bool (*ThreadPlanShouldStopHereCallback)( ThreadPlan *current_plan, Flags &flags, lldb::FrameComparison operation, Status &status, void *baton); diff --git a/lldb/source/Interpreter/OptionValueProperties.cpp b/lldb/source/Interpreter/OptionValueProperties.cpp index 4dae930c3a6f4..21750cf186156 100644 --- a/lldb/source/Interpreter/OptionValueProperties.cpp +++ b/lldb/source/Interpreter/OptionValueProperties.cpp @@ -60,10 +60,10 @@ void OptionValueProperties::Initialize(const PropertyDefinitions &defs) { } void OptionValueProperties::SetValueChangedCallback( - uint32_t property_idx, OptionValueChangedCallback callback, void *baton) { + uint32_t property_idx, std::function callback) { Property *property = ProtectedGetPropertyAtIndex(property_idx); if (property) - property->SetValueChangedCallback(callback, baton); + property->SetValueChangedCallback(std::move(callback)); } void OptionValueProperties::AppendProperty(ConstString name, diff --git a/lldb/source/Interpreter/Property.cpp b/lldb/source/Interpreter/Property.cpp index 78209311e2e51..a81098373c257 100644 --- a/lldb/source/Interpreter/Property.cpp +++ b/lldb/source/Interpreter/Property.cpp @@ -292,8 +292,7 @@ void Property::DumpDescription(CommandInterpreter &interpreter, Stream &strm, } } -void Property::SetValueChangedCallback(OptionValueChangedCallback callback, - void *baton) { +void Property::SetValueChangedCallback(std::function callback) { if (m_value_sp) - m_value_sp->SetValueChangedCallback(callback, baton); + m_value_sp->SetValueChangedCallback(std::move(callback)); } diff --git 
a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index a8fb32dafa898..6711dc37eca63 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -137,19 +137,12 @@ ProcessProperties::ProcessProperties(lldb_private::Process *process) Process::GetGlobalProperties().get()); m_collection_sp->SetValueChangedCallback( ePropertyPythonOSPluginPath, - ProcessProperties::OptionValueChangedCallback, this); + [this] { m_process->LoadOperatingSystemPlugin(true); }); } } ProcessProperties::~ProcessProperties() = default; -void ProcessProperties::OptionValueChangedCallback(void *baton, - OptionValue *option_value) { - ProcessProperties *properties = (ProcessProperties *)baton; - if (properties->m_process) - properties->m_process->LoadOperatingSystemPlugin(true); -} - bool ProcessProperties::GetDisableMemoryCache() const { const uint32_t idx = ePropertyDisableMemCache; return m_collection_sp->GetPropertyAtIndexAsBoolean( diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index e35a10a3f6bf8..83e6f3062666a 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -3461,29 +3461,24 @@ TargetProperties::TargetProperties(Target *target) // Set callbacks to update launch_info whenever "settins set" updated any // of these properties m_collection_sp->SetValueChangedCallback( - ePropertyArg0, TargetProperties::Arg0ValueChangedCallback, this); + ePropertyArg0, [this] { Arg0ValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyRunArgs, TargetProperties::RunArgsValueChangedCallback, this); + ePropertyRunArgs, [this] { RunArgsValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyEnvVars, TargetProperties::EnvVarsValueChangedCallback, this); + ePropertyEnvVars, [this] { EnvVarsValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyInputPath, TargetProperties::InputPathValueChangedCallback, - this); + ePropertyInputPath, [this] { InputPathValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyOutputPath, TargetProperties::OutputPathValueChangedCallback, - this); + ePropertyOutputPath, [this] { OutputPathValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyErrorPath, TargetProperties::ErrorPathValueChangedCallback, - this); + ePropertyErrorPath, [this] { ErrorPathValueChangedCallback(); }); + m_collection_sp->SetValueChangedCallback(ePropertyDetachOnError, [this] { + DetachOnErrorValueChangedCallback(); + }); m_collection_sp->SetValueChangedCallback( - ePropertyDetachOnError, - TargetProperties::DetachOnErrorValueChangedCallback, this); + ePropertyDisableASLR, [this] { DisableASLRValueChangedCallback(); }); m_collection_sp->SetValueChangedCallback( - ePropertyDisableASLR, TargetProperties::DisableASLRValueChangedCallback, - this); - m_collection_sp->SetValueChangedCallback( - ePropertyDisableSTDIO, - TargetProperties::DisableSTDIOValueChangedCallback, this); + ePropertyDisableSTDIO, [this] { DisableSTDIOValueChangedCallback(); }); m_experimental_properties_up.reset(new TargetExperimentalProperties()); m_collection_sp->AppendProperty( @@ -3493,16 +3488,16 @@ TargetProperties::TargetProperties(Target *target) true, m_experimental_properties_up->GetValueProperties()); // Update m_launch_info once it was created - Arg0ValueChangedCallback(this, nullptr); - RunArgsValueChangedCallback(this, nullptr); - // EnvVarsValueChangedCallback(this, nullptr); // FIXME: cause segfault in + Arg0ValueChangedCallback(); 
+ RunArgsValueChangedCallback(); + // EnvVarsValueChangedCallback(); // FIXME: cause segfault in // Target::GetPlatform() - InputPathValueChangedCallback(this, nullptr); - OutputPathValueChangedCallback(this, nullptr); - ErrorPathValueChangedCallback(this, nullptr); - DetachOnErrorValueChangedCallback(this, nullptr); - DisableASLRValueChangedCallback(this, nullptr); - DisableSTDIOValueChangedCallback(this, nullptr); + InputPathValueChangedCallback(); + OutputPathValueChangedCallback(); + ErrorPathValueChangedCallback(); + DetachOnErrorValueChangedCallback(); + DisableASLRValueChangedCallback(); + DisableSTDIOValueChangedCallback(); } else { m_collection_sp = std::make_shared(ConstString("target")); @@ -3975,81 +3970,54 @@ void TargetProperties::SetRequireHardwareBreakpoints(bool b) { m_collection_sp->SetPropertyAtIndexAsBoolean(nullptr, idx, b); } -void TargetProperties::Arg0ValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.SetArg0(this_->GetArg0()); +void TargetProperties::Arg0ValueChangedCallback() { + m_launch_info.SetArg0(GetArg0()); } -void TargetProperties::RunArgsValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); +void TargetProperties::RunArgsValueChangedCallback() { Args args; - if (this_->GetRunArguments(args)) - this_->m_launch_info.GetArguments() = args; + if (GetRunArguments(args)) + m_launch_info.GetArguments() = args; } -void TargetProperties::EnvVarsValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.GetEnvironment() = this_->GetEnvironment(); +void TargetProperties::EnvVarsValueChangedCallback() { + m_launch_info.GetEnvironment() = GetEnvironment(); } -void TargetProperties::InputPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDIN_FILENO, this_->GetStandardInputPath(), true, false); +void TargetProperties::InputPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDIN_FILENO, GetStandardInputPath(), true, + false); } -void TargetProperties::OutputPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDOUT_FILENO, this_->GetStandardOutputPath(), false, true); +void TargetProperties::OutputPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDOUT_FILENO, GetStandardOutputPath(), + false, true); } -void TargetProperties::ErrorPathValueChangedCallback(void *target_property_ptr, - OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - this_->m_launch_info.AppendOpenFileAction( - STDERR_FILENO, this_->GetStandardErrorPath(), false, true); +void TargetProperties::ErrorPathValueChangedCallback() { + m_launch_info.AppendOpenFileAction(STDERR_FILENO, GetStandardErrorPath(), + false, true); } -void TargetProperties::DetachOnErrorValueChangedCallback( - void *target_property_ptr, OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDetachOnError()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDetachOnError); +void TargetProperties::DetachOnErrorValueChangedCallback() { + if (GetDetachOnError()) + 
m_launch_info.GetFlags().Set(lldb::eLaunchFlagDetachOnError); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDetachOnError); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDetachOnError); } -void TargetProperties::DisableASLRValueChangedCallback( - void *target_property_ptr, OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDisableASLR()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableASLR); +void TargetProperties::DisableASLRValueChangedCallback() { + if (GetDisableASLR()) + m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableASLR); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableASLR); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableASLR); } -void TargetProperties::DisableSTDIOValueChangedCallback( - void *target_property_ptr, OptionValue *) { - TargetProperties *this_ = - static_cast(target_property_ptr); - if (this_->GetDisableSTDIO()) - this_->m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableSTDIO); +void TargetProperties::DisableSTDIOValueChangedCallback() { + if (GetDisableSTDIO()) + m_launch_info.GetFlags().Set(lldb::eLaunchFlagDisableSTDIO); else - this_->m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableSTDIO); + m_launch_info.GetFlags().Clear(lldb::eLaunchFlagDisableSTDIO); } // Target::TargetEventData From 032a9393a739a1ef8c30979e2493971207c3c397 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 9 Jan 2020 08:20:24 -0500 Subject: [PATCH 54/89] [InstCombine] Use minimal FMF in testcase for Z / (1.0 / Y) => (Y * Z); NFC Patch by: @raghesh (Raghesh Aloor) Differential Revision: https://reviews.llvm.org/D72431 --- llvm/test/Transforms/InstCombine/fdiv.ll | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index 8bfeb67f6e066..178fe1bcef439 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -192,12 +192,12 @@ define float @div_with_div_denominator_extra_use(float %x, float %y, float %z) { define float @div_with_div_denominator_with_one_as_numerator_extra_use(float %x, float %y, float %z) { ; CHECK-LABEL: @div_with_div_denominator_with_one_as_numerator_extra_use( ; CHECK-NEXT: [[DIV1:%.*]] = fdiv float 1.000000e+00, [[Y:%.*]] -; CHECK-NEXT: [[DIV2:%.*]] = fdiv fast float [[Z:%.*]], [[DIV1]] +; CHECK-NEXT: [[DIV2:%.*]] = fdiv reassoc arcp float [[Z:%.*]], [[DIV1]] ; CHECK-NEXT: call void @use_f32(float [[DIV1]]) ; CHECK-NEXT: ret float [[DIV2]] ; %div1 = fdiv float 1.0, %y - %div2 = fdiv fast float %z, %div1 + %div2 = fdiv reassoc arcp float %z, %div1 call void @use_f32(float %div1) ret float %div2 } From 173b711e83d7b61a46f55eb44f03ea98f69a1dd6 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Thu, 9 Jan 2020 13:47:52 +0000 Subject: [PATCH 55/89] [ARM][MVE] MVE-I should not be disabled by -mfpu=none Architecturally, it's allowed to have MVE-I without an FPU, thus -mfpu=none should not disable MVE-I, or moves to/from FP-registers. This patch removes `+/-fpregs` from features unconditionally added to target feature list, depending on FPU and moves the logic to Clang driver, where the negative form (`-fpregs`) is conditionally added to the target features list for the cases of `-mfloat-abi=soft`, or `-mfpu=none` without either `+mve` or `+mve.fp`. Only the negative form is added by the driver, the positive one is derived from other features in the backend. 
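Condensed into a single predicate (a hypothetical helper for illustration only, not the literal driver code in this patch):

  // True when the driver should pass "-fpregs" to the backend.
  static bool shouldDisableFPRegs(bool SoftFloatABI, bool FPUIsNone,
                                  bool HasMVE, bool HasMVEFP) {
    // -mfloat-abi=soft always drops the FP/MVE register file.
    if (SoftFloatABI)
      return true;
    // -mfpu=none drops it only when neither MVE-I (+mve) nor MVE-FP (+mve.fp)
    // was requested, because MVE-I still needs the registers.
    return FPUIsNone && !HasMVE && !HasMVEFP;
  }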
Differential Revision: https://reviews.llvm.org/D71843 --- clang/lib/Driver/ToolChains/Arch/ARM.cpp | 43 ++++++++------- clang/test/CodeGen/arm-target-features.c | 28 +++++----- clang/test/Driver/arm-mfpu.c | 54 +++++++++++++------ llvm/lib/Support/ARMTargetParser.cpp | 2 - .../CodeGen/ARM/softfp-constant-comparison.ll | 2 +- .../dont-remove-loop-update.mir | 2 +- .../dont-remove-loop-update2.mir | 2 +- .../dont-remove-loop-update3.mir | 2 +- .../Thumb2/LowOverheadLoops/fast-fp-loops.ll | 2 +- .../LowOverheadLoops/mve-float-loops.ll | 2 +- .../LoopVectorize/ARM/mve-shiftcost.ll | 2 +- .../Transforms/LoopVectorize/ARM/mve-vldn.ll | 2 +- 12 files changed, 82 insertions(+), 61 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Arch/ARM.cpp b/clang/lib/Driver/ToolChains/Arch/ARM.cpp index 68a57310ad402..ae1d7eaf7089f 100644 --- a/clang/lib/Driver/ToolChains/Arch/ARM.cpp +++ b/clang/lib/Driver/ToolChains/Arch/ARM.cpp @@ -63,12 +63,13 @@ static void getARMHWDivFeatures(const Driver &D, const Arg *A, } // Handle -mfpu=. -static void getARMFPUFeatures(const Driver &D, const Arg *A, +unsigned getARMFPUFeatures(const Driver &D, const Arg *A, const ArgList &Args, StringRef FPU, std::vector &Features) { unsigned FPUID = llvm::ARM::parseFPU(FPU); if (!llvm::ARM::getFPUFeatures(FPUID, Features)) D.Diag(clang::diag::err_drv_clang_unsupported) << A->getAsString(Args); + return FPUID; } // Decode ARM features from string like +[no]featureA+[no]featureB+... @@ -388,18 +389,20 @@ void arm::getARMTargetFeatures(const ToolChain &TC, checkARMCPUName(D, CPUArg, Args, CPUName, ArchName, ExtensionFeatures, Triple); // Honor -mfpu=. ClangAs gives preference to -Wa,-mfpu=. + unsigned FPUID = llvm::ARM::FK_INVALID; const Arg *FPUArg = Args.getLastArg(options::OPT_mfpu_EQ); if (WaFPU) { if (FPUArg) D.Diag(clang::diag::warn_drv_unused_argument) << FPUArg->getAsString(Args); - getARMFPUFeatures(D, WaFPU, Args, StringRef(WaFPU->getValue()).substr(6), - Features); + (void)getARMFPUFeatures(D, WaFPU, Args, StringRef(WaFPU->getValue()).substr(6), + Features); } else if (FPUArg) { - getARMFPUFeatures(D, FPUArg, Args, FPUArg->getValue(), Features); + FPUID = getARMFPUFeatures(D, FPUArg, Args, FPUArg->getValue(), Features); } else if (Triple.isAndroid() && getARMSubArchVersionNumber(Triple) >= 7) { const char *AndroidFPU = "neon"; - if (!llvm::ARM::getFPUFeatures(llvm::ARM::parseFPU(AndroidFPU), Features)) + FPUID = llvm::ARM::parseFPU(AndroidFPU); + if (!llvm::ARM::getFPUFeatures(FPUID, Features)) D.Diag(clang::diag::err_drv_clang_unsupported) << std::string("-mfpu=") + AndroidFPU; } @@ -454,21 +457,21 @@ void arm::getARMTargetFeatures(const ToolChain &TC, if (ABI == arm::FloatABI::Soft) { llvm::ARM::getFPUFeatures(llvm::ARM::FK_NONE, Features); - // Disable all features relating to hardware FP. - // FIXME: Disabling fpregs should be enough all by itself, since all - // the other FP features are dependent on it. However - // there is currently no easy way to test this in clang, so for - // now just be explicit and disable all known dependent features - // as well. - for (std::string Feature : { - "vfp2", "vfp2sp", - "vfp3", "vfp3sp", "vfp3d16", "vfp3d16sp", - "vfp4", "vfp4sp", "vfp4d16", "vfp4d16sp", - "fp-armv8", "fp-armv8sp", "fp-armv8d16", "fp-armv8d16sp", - "fullfp16", "neon", "crypto", "dotprod", "fp16fml", - "mve", "mve.fp", - "fp64", "d32", "fpregs"}) - Features.push_back(Args.MakeArgString("-" + Feature)); + // Disable all features relating to hardware FP, not already disabled by the + // above call. 
+ Features.insert(Features.end(), {"-neon", "-crypto", "-dotprod", "-fp16fml", + "-mve", "-mve.fp", "-fpregs"}); + } else if (FPUID == llvm::ARM::FK_NONE) { + // -mfpu=none is *very* similar to -mfloat-abi=soft, only that it should not + // disable MVE-I. + Features.insert(Features.end(), + {"-neon", "-crypto", "-dotprod", "-fp16fml", "-mve.fp"}); + // Even though we remove MVE-FP, we still need to check if it was originally + // present among the requested extensions, because it implies MVE-I, which + // should not be disabled by -mfpu-none. + if (!llvm::is_contained(Features, "+mve") && + !llvm::is_contained(Features, "+mve.fp")) + Features.emplace_back("-fpregs"); } // En/disable crc code generation. diff --git a/clang/test/CodeGen/arm-target-features.c b/clang/test/CodeGen/arm-target-features.c index 03719f8a9e5dc..11fe4e505439f 100644 --- a/clang/test/CodeGen/arm-target-features.c +++ b/clang/test/CodeGen/arm-target-features.c @@ -1,23 +1,23 @@ // REQUIRES: arm-registered-target // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3 -// CHECK-VFP3: "target-features"="+armv7-a,+d32,+dsp,+fp64,+fpregs,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp" +// CHECK-VFP3: "target-features"="+armv7-a,+d32,+dsp,+fp64,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4 -// CHECK-VFP4: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu cortex-a7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-a12 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV // RUN: %clang_cc1 -triple thumbv7s-linux-gnueabi -target-cpu swift -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-2 // RUN: %clang_cc1 -triple thumbv7-linux-gnueabihf -target-cpu krait -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV -// CHECK-VFP4-DIV: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" -// CHECK-VFP4-DIV-2: "target-features"="+armv7s,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4-DIV: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-VFP4-DIV-2: "target-features"="+armv7s,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a15 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM // RUN: %clang_cc1 -triple armv7-linux-gnueabihf -target-cpu cortex-a17 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-DIV-ARM -// CHECK-VFP4-DIV-ARM: 
"target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" +// CHECK-VFP4-DIV-ARM: "target-features"="+armv7-a,+d32,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7s-apple-ios7.0 -target-cpu cyclone -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 @@ -26,34 +26,34 @@ // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a72 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu cortex-a73 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m3 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8 -// CHECK-BASIC-V8: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-BASIC-V8: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V82 // RUN: %clang_cc1 -triple thumbv8-linux-gnueabihf -target-cpu exynos-m5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V82 -// CHECK-BASIC-V82: "target-features"="+armv8.2-a,+crc,+crypto,+d32,+dotprod,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" +// CHECK-BASIC-V82: "target-features"="+armv8.2-a,+crc,+crypto,+d32,+dotprod,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+thumb-mode,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp" // RUN: %clang_cc1 -triple armv8-linux-gnueabi -target-cpu cortex-a53 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-BASIC-V8-ARM -// CHECK-BASIC-V8-ARM: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" +// CHECK-BASIC-V8-ARM: "target-features"="+armv8-a,+crc,+crypto,+d32,+dsp,+fp-armv8,+fp-armv8d16,+fp-armv8d16sp,+fp-armv8sp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+neon,+vfp2,+vfp2sp,+vfp3,+vfp3d16,+vfp3d16sp,+vfp3sp,+vfp4,+vfp4d16,+vfp4d16sp,+vfp4sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r5 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-DIV -// CHECK-VFP3-D16-DIV: "target-features"="+armv7-r,+dsp,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" +// CHECK-VFP3-D16-DIV: "target-features"="+armv7-r,+dsp,+fp64,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu 
cortex-r4f -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-THUMB-DIV -// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+armv7-r,+dsp,+fp64,+fpregs,+hwdiv,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,-thumb-mode" +// CHECK-VFP3-D16-THUMB-DIV: "target-features"="+armv7-r,+dsp,+fp64,+hwdiv,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,-thumb-mode" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-r8 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP3-D16-FP16-DIV -// CHECK-VFP3-D16-FP16-DIV: "target-features"="+armv7-r,+dsp,+fp16,+fp64,+fpregs,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" +// CHECK-VFP3-D16-FP16-DIV: "target-features"="+armv7-r,+dsp,+fp16,+fp64,+hwdiv,+hwdiv-arm,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP4-D16-SP-THUMB-DIV -// CHECK-VFP4-D16-SP-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" +// CHECK-VFP4-D16-SP-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" // RUN: %clang_cc1 -triple thumbv7-linux-gnueabi -target-cpu cortex-m7 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-VFP5-D16-THUMB-DIV -// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+hwdiv,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" +// CHECK-VFP5-D16-THUMB-DIV: "target-features"="+armv7e-m,+dsp,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+hwdiv,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp" // RUN: %clang_cc1 -triple armv7-linux-gnueabi -target-cpu cortex-r4 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-THUMB-DIV @@ -105,6 +105,6 @@ // CHECK-ARMV8M-M23-LINUX: "target-features"="+armv8-m.base,+hwdiv,+thumb-mode" // RUN: %clang_cc1 -triple thumb-linux-gnueabi -target-cpu cortex-m33 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-ARMV8M-MAIN-LINUX -// CHECK-ARMV8M-MAIN-LINUX: "target-features"="+armv8-m.main,+dsp,+fp-armv8d16sp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" +// CHECK-ARMV8M-MAIN-LINUX: "target-features"="+armv8-m.main,+dsp,+fp-armv8d16sp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" void foo() {} diff --git a/clang/test/Driver/arm-mfpu.c b/clang/test/Driver/arm-mfpu.c index b709622f72075..c3731fa5bd635 100644 --- a/clang/test/Driver/arm-mfpu.c +++ b/clang/test/Driver/arm-mfpu.c @@ -84,7 +84,7 @@ // CHECK-VFP3-D16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3-D16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP3-D16-DAG: "-target-feature" "+fp64" -// CHECK-VFP3-D16-NOT: "-target-feature" "+d32" +// CHECK-VFP3-D16-DAG: "-target-feature" "-d32" // CHECK-VFP3-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=vfpv3-d16-fp16 %s -### -o %t.o 2>&1 \ @@ -98,7 +98,7 @@ // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "+fp64" -// CHECK-VFP3-D16-FP16-NOT: "-target-feature" "+d32" +// CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-d32" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-neon" // CHECK-VFP3-D16-FP16-DAG: "-target-feature" "-crypto" @@ -108,8 +108,8 @@ // RUN: | FileCheck 
--check-prefix=CHECK-SOFT-ABI-FP-3 %s // CHECK-VFP3XD-NOT: "-target-feature" "+soft-float" // CHECK-VFP3XD-DAG: "-target-feature" "+soft-float-abi" -// CHECK-VFP3XD-NOT: "-target-feature" "+fp64" -// CHECK-VFP3XD-NOT: "-target-feature" "+d32" +// CHECK-VFP3XD-DAG: "-target-feature" "-fp64" +// CHECK-VFP3XD-DAG: "-target-feature" "-d32" // CHECK-VFP3XD-DAG: "-target-feature" "+vfp3d16sp" // CHECK-VFP3XD-DAG: "-target-feature" "-fp16" // CHECK-VFP3XD-DAG: "-target-feature" "-vfp4d16sp" @@ -127,8 +127,8 @@ // CHECK-VFP3XD-FP16-DAG: "-target-feature" "+fp16" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-vfp4d16sp" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-VFP3XD-FP16-NOT: "-target-feature" "+fp64" -// CHECK-VFP3XD-FP16-NOT: "-target-feature" "+d32" +// CHECK-VFP3XD-FP16-DAG: "-target-feature" "-fp64" +// CHECK-VFP3XD-FP16-DAG: "-target-feature" "-d32" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-neon" // CHECK-VFP3XD-FP16-DAG: "-target-feature" "-crypto" @@ -162,7 +162,7 @@ // CHECK-VFP4-D16-DAG: "-target-feature" "+vfp4d16" // CHECK-VFP4-D16-DAG: "-target-feature" "-fp-armv8d16sp" // CHECK-VFP4-D16-DAG: "-target-feature" "+fp64" -// CHECK-VFP4-D16-NOT: "-target-feature" "+d32" +// CHECK-VFP4-D16-DAG: "-target-feature" "-d32" // CHECK-VFP4-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=fp4-sp-d16 %s -### -o %t.o 2>&1 \ @@ -175,8 +175,8 @@ // CHECK-FP4-SP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP4-SP-D16-DAG: "-target-feature" "+vfp4d16sp" // CHECK-FP4-SP-D16-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-FP4-SP-D16-NOT: "-target-feature" "+fp64" -// CHECK-FP4-SP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP4-SP-D16-DAG: "-target-feature" "-fp64" +// CHECK-FP4-SP-D16-DAG: "-target-feature" "-d32" // CHECK-FP4-SP-D16-DAG: "-target-feature" "-neon" // RUN: %clang -target arm-linux-eabi -mfpu=fp5-sp-d16 %s -### -o %t.o 2>&1 \ @@ -189,8 +189,8 @@ // CHECK-FP5-SP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP5-SP-D16-DAG: "-target-feature" "+fp-armv8d16sp" // CHECK-FP5-SP-D16-DAG: "-target-feature" "-neon" -// CHECK-FP5-SP-D16-NOT: "-target-feature" "+fp64" -// CHECK-FP5-SP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP5-SP-D16-DAG: "-target-feature" "-fp64" +// CHECK-FP5-SP-D16-DAG: "-target-feature" "-d32" // CHECK-FP5-SP-D16-DAG: "-target-feature" "-crypto" // RUN: %clang -target arm-linux-eabi -mfpu=fp5-dp-d16 %s -### -o %t.o 2>&1 \ @@ -203,7 +203,7 @@ // CHECK-FP5-DP-D16-DAG: "-target-feature" "+soft-float-abi" // CHECK-FP5-DP-D16-DAG: "-target-feature" "+fp-armv8d16" // CHECK-FP5-DP-D16-DAG: "-target-feature" "+fp64" -// CHECK-FP5-DP-D16-NOT: "-target-feature" "+d32" +// CHECK-FP5-DP-D16-DAG: "-target-feature" "-d32" // CHECK-FP5-DP-D16-DAG: "-target-feature" "-neon" // CHECK-FP5-DP-D16-DAG: "-target-feature" "-crypto" // CHECK-SOFT-ABI-FP-5-DAG: "-target-feature" "+soft-float" @@ -323,8 +323,8 @@ // CHECK-NO-FP-DAG: "-target-feature" "-vfp3d16sp" // CHECK-NO-FP-DAG: "-target-feature" "-vfp4d16sp" // CHECK-NO-FP-DAG: "-target-feature" "-fp-armv8d16sp" -// CHECK-NO-FP-NOT: "-target-feature" "+fp64" -// CHECK-NO-FP-NOT: "-target-feature" "+d32" +// CHECK-NO-FP-DAG: "-target-feature" "-fp64" +// CHECK-NO-FP-DAG: "-target-feature" "-d32" // CHECK-NO-FP-DAG: "-target-feature" "-neon" // CHECK-NO-FP-DAG: "-target-feature" "-crypto" @@ -382,8 +382,8 @@ // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+soft-float" // CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+soft-float-abi" // 
CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+vfp3" -// CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+vfp4" -// CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+fp-armv8" +// CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "-vfp4" +// CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "-fp-armv8" // CHECK-ARM7-ANDROID-FP-DEFAULT-DAG: "-target-feature" "+neon" // CHECK-ARM7-ANDROID-FP-DEFAULT-NOT: "-target-feature" "+crypto" @@ -391,7 +391,7 @@ // RUN: | FileCheck --check-prefix=CHECK-ARM7-ANDROID-FP-D16 %s // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+soft-float" // CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "+soft-float-abi" -// CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+d32" +// CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "-d32" // CHECK-ARM7-ANDROID-FP-D16-DAG: "-target-feature" "+vfp3d16" // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+vfp4" // CHECK-ARM7-ANDROID-FP-D16-NOT: "-target-feature" "+fp-armv8" @@ -403,3 +403,23 @@ // CHECK-SOFTFLOATABI-INHIBITS-MVE-NOT: "-target-feature" "+mve" // CHECK-SOFTFLOATABI-INHIBITS-MVE-DAG: "-target-feature" "-mve" // CHECK-SOFTFLOATABI-INHIBITS-MVE-DAG: "-target-feature" "-mve.fp" + +// RUN: %clang -target arm-none-none-eabi %s -march=armv8.1-m.main+mve.fp -mfpu=none -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MVEFP-FPUNONE %s +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp2sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp3d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-vfp4d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-fp-armv8d16sp" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-fp64" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-d32" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-neon" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-crypto" +// CHECK-MVEFP-FPUNONE-DAG: "-target-feature" "-mve.fp" +// CHECK-MVEFP-FPUNONE-NOT: "-target-feature" "-fpregs" + + +// RUN: %clang -target arm-none-none-eabi %s -march=armv8.1-m.main+mve -mfpu=none -### -c 2>&1 \ +// RUN: | FileCheck --check-prefix=CHECK-MVEI-FPUNONE %s +// CHECK-MVEI-FPUNONE-DAG: "-target-feature" "-mve.fp" +// CHECK-MVEI-FPUNONE-DAG: "-target-feature" "+mve" +// CHECK-MVEI-FPUNONE-NOT: "-target-feature" "-fpregs" diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp index ce5daa7fe58c0..f2c22fd93c8b8 100644 --- a/llvm/lib/Support/ARMTargetParser.cpp +++ b/llvm/lib/Support/ARMTargetParser.cpp @@ -174,8 +174,6 @@ bool ARM::getFPUFeatures(unsigned FPUKind, std::vector &Features) { // under FPURestriction::None, which is the only FPURestriction in // which they would be valid (since FPURestriction::SP doesn't // exist). 
- - {"+fpregs", "-fpregs", FPUVersion::VFPV2, FPURestriction::SP_D16}, {"+vfp2", "-vfp2", FPUVersion::VFPV2, FPURestriction::D16}, {"+vfp2sp", "-vfp2sp", FPUVersion::VFPV2, FPURestriction::SP_D16}, {"+vfp3", "-vfp3", FPUVersion::VFPV3, FPURestriction::None}, diff --git a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll index f70e9f378f3b0..e7e4d2bf26f2b 100644 --- a/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll +++ b/llvm/test/CodeGen/ARM/softfp-constant-comparison.ll @@ -43,4 +43,4 @@ land.end: ; preds = %land.rhs, %entry ret void } -attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+fpregs,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } +attributes #0 = { noinline nounwind optnone "correctly-rounded-divide-sqrt-fp-math"="false" "denormal-fp-math"="preserve-sign" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m4" "target-features"="+armv7e-m,+dsp,+fp16,+hwdiv,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp,-aes,-crc,-crypto,-dotprod,-fp16fml,-fullfp16,-hwdiv-arm,-lob,-mve,-mve.fp,-ras,-sb,-sha2" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir index 976c5f5d7ba36..6dd8caafc33e7 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update.mir @@ -60,7 +60,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" 
"target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir index 9e429040db4fd..d49a1e86109b8 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update2.mir @@ -62,7 +62,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir index ab7fcf843d7dc..bf1c40fb34e5f 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-remove-loop-update3.mir @@ -62,7 +62,7 @@ declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #3 declare void @llvm.stackprotector(i8*, i8**) #5 - attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } + attributes #0 = { nofree norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="true" "no-jump-tables"="false" 
"no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "unsafe-fp-math"="true" "use-soft-float"="false" } attributes #1 = { noduplicate nounwind } attributes #2 = { nounwind readnone } attributes #3 = { argmemonly nounwind willreturn } diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll index ddf51b785ff0d..e25e0298eb243 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 -disable-mve-tail-predication=false %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 -disable-mve-tail-predication=false %s -o - | FileCheck %s define arm_aapcs_vfpcc void @fast_float_mul(float* nocapture %a, float* nocapture readonly %b, float* nocapture readonly %c, i32 %N) { ; CHECK-LABEL: fast_float_mul: diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll index ebb041d937224..b152191798bcf 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-float-loops.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16 %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp,+fp-armv8d16sp,+fp16,+fullfp16 %s -o - | FileCheck %s define arm_aapcs_vfpcc void @float_float_mul(float* nocapture readonly %a, float* nocapture readonly %b, float* nocapture %c, i32 %N) { ; CHECK-LABEL: float_float_mul: diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll index f005845f2db85..7195e61c280fc 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-shiftcost.ll @@ -84,4 +84,4 @@ exit: ret void } -attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" } +attributes #0 = { nounwind "min-legal-vector-width"="0" "target-cpu"="generic" "target-features"="+armv8.1-m.main,+fp-armv8d16sp,+fp16,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2sp,+vfp3d16sp,+vfp4d16sp" "use-soft-float"="false" } diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll index cb6e1005db1c2..353e725580a4a 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-vldn.ll @@ -84,4 +84,4 @@ while.end: ; preds = %while.body, %entry ret void } -attributes #0 = { 
"target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fpregs,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } +attributes #0 = { "target-features"="+armv8.1-m.main,+fp-armv8d16,+fp-armv8d16sp,+fp16,+fp64,+fullfp16,+hwdiv,+lob,+mve.fp,+ras,+strict-align,+thumb-mode,+vfp2,+vfp2sp,+vfp3d16,+vfp3d16sp,+vfp4d16,+vfp4d16sp,-crypto,-d32,-fp-armv8,-fp-armv8sp,-neon,-vfp3,-vfp3sp,-vfp4,-vfp4sp" } From f53b38d12a7b9c6754d5bc91483efab935b5c012 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 9 Jan 2020 09:02:53 -0500 Subject: [PATCH 56/89] [InstSimplify] select Cond, true, false --> Cond This is step 1 of damage control assuming that we need to remove several over-reaching folds for select-of-booleans because they can cause miscompiles as shown in D72396. The scalar case seems obviously safe: https://rise4fun.com/Alive/jSj And I don't think there's any danger for vectors either - if the condition is poisoned, then the select must be poisoned too, so undef elements don't make any difference. Differential Revision: https://reviews.llvm.org/D72412 --- llvm/lib/Analysis/InstructionSimplify.cpp | 9 +++++++++ llvm/test/Transforms/InstSimplify/select.ll | 9 +++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp index afcca2ab1fa3a..d7510c8991013 100644 --- a/llvm/lib/Analysis/InstructionSimplify.cpp +++ b/llvm/lib/Analysis/InstructionSimplify.cpp @@ -3996,6 +3996,15 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal, return FalseVal; } + // select i1 Cond, i1 true, i1 false --> i1 Cond + assert(Cond->getType()->isIntOrIntVectorTy(1) && + "Select must have bool or bool vector condition"); + assert(TrueVal->getType() == FalseVal->getType() && + "Select must have same types for true/false ops"); + if (Cond->getType() == TrueVal->getType() && + match(TrueVal, m_One()) && match(FalseVal, m_ZeroInt())) + return Cond; + // select ?, X, X -> X if (TrueVal == FalseVal) return TrueVal; diff --git a/llvm/test/Transforms/InstSimplify/select.ll b/llvm/test/Transforms/InstSimplify/select.ll index 5581a3bd74719..4fe499ad4a24e 100644 --- a/llvm/test/Transforms/InstSimplify/select.ll +++ b/llvm/test/Transforms/InstSimplify/select.ll @@ -3,8 +3,7 @@ define i1 @bool_true_or_false(i1 %cond) { ; CHECK-LABEL: @bool_true_or_false( -; CHECK-NEXT: [[S:%.*]] = select i1 [[COND:%.*]], i1 true, i1 false -; CHECK-NEXT: ret i1 [[S]] +; CHECK-NEXT: ret i1 [[COND:%.*]] ; %s = select i1 %cond, i1 true, i1 false ret i1 %s @@ -12,8 +11,7 @@ define i1 @bool_true_or_false(i1 %cond) { define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { ; CHECK-LABEL: @bool_true_or_false_vec( -; CHECK-NEXT: [[S:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i1> , <2 x i1> zeroinitializer -; CHECK-NEXT: ret <2 x i1> [[S]] +; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> zeroinitializer ret <2 x i1> %s @@ -21,8 +19,7 @@ define <2 x i1> @bool_true_or_false_vec(<2 x i1> %cond) { define <2 x i1> @bool_true_or_false_vec_undef(<2 x i1> %cond) { ; CHECK-LABEL: @bool_true_or_false_vec_undef( -; CHECK-NEXT: [[S:%.*]] = select <2 x i1> [[COND:%.*]], <2 x i1> , <2 x i1> -; CHECK-NEXT: ret <2 x i1> [[S]] +; CHECK-NEXT: ret <2 x i1> [[COND:%.*]] ; %s = select <2 x i1> %cond, <2 x i1> , <2 x i1> ret <2 x i1> %s From 
84ce46269cfda8800346706251ac3587b2d1c9f5 Mon Sep 17 00:00:00 2001 From: Saar Raz Date: Thu, 9 Jan 2020 16:13:46 +0200 Subject: [PATCH 57/89] [Concepts] Fix failing test on Windows Fix test failed by D43357 on Windows. --- clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp index dba2ef204e1d6..36c68071448c7 100644 --- a/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp +++ b/clang/test/CXX/over/over.match/over.match.best/p1-2a.cpp @@ -79,11 +79,11 @@ namespace non_template return 0.0; } - void bar() requires (sizeof(long) >= 8) { } + void bar() requires (sizeof(char[8]) >= 8) { } // expected-note@-1 {{candidate function}} // expected-note@-2 {{similar constraint expressions not considered equivalent}} - void bar() requires (sizeof(long) >= 8 && sizeof(int) <= 30) { } + void bar() requires (sizeof(char[8]) >= 8 && sizeof(int) <= 30) { } // expected-note@-1 {{candidate function}} // expected-note@-2 {{similar constraint expression here}} @@ -111,4 +111,3 @@ namespace non_template static_assert(goo(1) == 1); static_assert(doo(2) == 1); // expected-error {{call to 'doo' is ambiguous}} } - From cb5612e2df893728887bedd41aa2293f454c7845 Mon Sep 17 00:00:00 2001 From: Sanjay Patel Date: Thu, 9 Jan 2020 09:36:22 -0500 Subject: [PATCH 58/89] [DAGCombiner] reduce extract subvector of concat If we are extracting a chunk of a vector that's a fraction of an operand of the concatenated vector operand, we can extract directly from one of those original operands. This is another suggestion from PR42024: https://bugs.llvm.org/show_bug.cgi?id=42024#c2 But I'm not sure yet if it will make any difference on those patterns. It seems to help a few existing AVX512 tests though. Differential Revision: https://reviews.llvm.org/D72361 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++- llvm/test/CodeGen/X86/avg.ll | 12 ++- llvm/test/CodeGen/X86/pr34657.ll | 13 ++-- .../CodeGen/X86/x86-interleaved-access.ll | 76 +++++-------------- 4 files changed, 45 insertions(+), 74 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 37b1b17218d02..6030c95742015 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18594,8 +18594,22 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) { if (ConcatSrcNumElts == ExtNumElts) return V.getOperand(ConcatOpIdx); - // TODO: Handle the case where the concat operands are larger than the - // result of this extract by extracting directly from a concat op. + // If the concatenated source vectors are a multiple length of this extract, + // then extract a fraction of one of those source vectors directly from a + // concat operand. 
Example: + // v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 --> + // v2i8 extract_subvec v8i8 Y, 6 + if (ConcatSrcNumElts % ExtNumElts == 0) { + SDLoc DL(N); + unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts; + assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts && + "Trying to extract from >1 concat operand?"); + assert(NewExtIdx % ExtNumElts == 0 && + "Extract index is not a multiple of the input vector length."); + SDValue NewIndexC = DAG.getIntPtrConstant(NewExtIdx, DL); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT, + V.getOperand(ConcatOpIdx), NewIndexC); + } } V = peekThroughBitcasts(V); diff --git a/llvm/test/CodeGen/X86/avg.ll b/llvm/test/CodeGen/X86/avg.ll index a5fd84c32ed51..e4a5d1392c0bf 100644 --- a/llvm/test/CodeGen/X86/avg.ll +++ b/llvm/test/CodeGen/X86/avg.ll @@ -462,14 +462,12 @@ define void @avg_v48i8(<48 x i8>* %a, <48 x i8>* %b) nounwind { ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm2 ; AVX512BW-NEXT: vpavgb (%rsi), %xmm0, %xmm0 -; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-NEXT: vpavgb 32(%rsi), %xmm2, %xmm1 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm1 -; AVX512BW-NEXT: vmovdqu %ymm0, (%rax) -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, (%rax) -; AVX512BW-NEXT: vzeroupper +; AVX512BW-NEXT: vpavgb 16(%rsi), %xmm1, %xmm1 +; AVX512BW-NEXT: vmovdqu %xmm1, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm0, (%rax) +; AVX512BW-NEXT: vmovdqu %xmm2, (%rax) ; AVX512BW-NEXT: retq %1 = load <48 x i8>, <48 x i8>* %a %2 = load <48 x i8>, <48 x i8>* %b diff --git a/llvm/test/CodeGen/X86/pr34657.ll b/llvm/test/CodeGen/X86/pr34657.ll index d8b72920fed15..9761927dc239b 100644 --- a/llvm/test/CodeGen/X86/pr34657.ll +++ b/llvm/test/CodeGen/X86/pr34657.ll @@ -5,13 +5,12 @@ define <112 x i8> @pr34657(<112 x i8>* %src) local_unnamed_addr { ; CHECK-LABEL: pr34657: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq %rdi, %rax -; CHECK-NEXT: vmovups 64(%rsi), %ymm0 -; CHECK-NEXT: vbroadcastf128 {{.*#+}} ymm1 = mem[0,1,0,1] -; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm1 -; CHECK-NEXT: vmovups (%rsi), %zmm2 -; CHECK-NEXT: vmovaps %ymm0, 64(%rdi) -; CHECK-NEXT: vmovaps %zmm2, (%rdi) -; CHECK-NEXT: vextractf32x4 $2, %zmm1, 96(%rdi) +; CHECK-NEXT: vmovups (%rsi), %zmm0 +; CHECK-NEXT: vmovups 64(%rsi), %ymm1 +; CHECK-NEXT: vmovups 96(%rsi), %xmm2 +; CHECK-NEXT: vmovaps %xmm2, 96(%rdi) +; CHECK-NEXT: vmovaps %ymm1, 64(%rdi) +; CHECK-NEXT: vmovaps %zmm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index fcdebfa68a5e7..74a83214bf208 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -1055,64 +1055,24 @@ ret void } define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, <48 x i8>* %p) { -; AVX1-LABEL: interleaved_store_vf16_i8_stride3: -; AVX1: # %bb.0: -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = 
xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX1-NEXT: vmovdqu %xmm1, (%rdi) -; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: interleaved_store_vf16_i8_stride3: -; AVX2: # %bb.0: -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX2-NEXT: vmovdqu %xmm1, (%rdi) -; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX2-NEXT: retq -; -; AVX512-LABEL: interleaved_store_vf16_i8_stride3: -; AVX512: # %bb.0: -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX512-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX512-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX512-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX512-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX512-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX512-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX512-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX512-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX512-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm1 -; AVX512-NEXT: vmovdqu %ymm0, (%rdi) -; AVX512-NEXT: vextracti32x4 $2, %zmm1, 32(%rdi) -; AVX512-NEXT: vzeroupper -; AVX512-NEXT: retq +; AVX-LABEL: interleaved_store_vf16_i8_stride3: +; AVX: # %bb.0: +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = 
xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovdqu %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, 32(%rdi) +; AVX-NEXT: retq %1 = shufflevector <16 x i8> %a, <16 x i8> %b, <32 x i32> %2 = shufflevector <16 x i8> %c, <16 x i8> undef, <32 x i32> %interleaved.vec = shufflevector <32 x i8> %1, <32 x i8> %2, <48 x i32> From 4c11703b3d9d936214b8ced70bd6475974c317d8 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 9 Jan 2020 09:28:59 -0500 Subject: [PATCH 59/89] [OPENMP]Remove unused code, NFC. --- clang/lib/CodeGen/CGOpenMPRuntime.cpp | 75 --------------------------- clang/lib/CodeGen/CGOpenMPRuntime.h | 29 +---------- 2 files changed, 2 insertions(+), 102 deletions(-) diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index ee7a2e9204ace..0d96c5a2f73f4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -733,10 +733,6 @@ enum OpenMPRTLFunction { OMPRTL__tgt_target_teams_nowait, // Call to void __tgt_register_requires(int64_t flags); OMPRTL__tgt_register_requires, - // Call to void __tgt_register_lib(__tgt_bin_desc *desc); - OMPRTL__tgt_register_lib, - // Call to void __tgt_unregister_lib(__tgt_bin_desc *desc); - OMPRTL__tgt_unregister_lib, // Call to void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, // void** args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); OMPRTL__tgt_target_data_begin, @@ -2478,26 +2474,6 @@ llvm::FunctionCallee CGOpenMPRuntime::createRuntimeFunction(unsigned Function) { RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_requires"); break; } - case OMPRTL__tgt_register_lib: { - // Build void __tgt_register_lib(__tgt_bin_desc *desc); - QualType ParamTy = - CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); - llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_register_lib"); - break; - } - case OMPRTL__tgt_unregister_lib: { - // Build void __tgt_unregister_lib(__tgt_bin_desc *desc); - QualType ParamTy = - CGM.getContext().getPointerType(getTgtBinaryDescriptorQTy()); - llvm::Type *TypeParams[] = {CGM.getTypes().ConvertTypeForMem(ParamTy)}; - auto *FnTy = - llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false); - RTLFn = CGM.CreateRuntimeFunction(FnTy, "__tgt_unregister_lib"); - break; - } case OMPRTL__tgt_target_data_begin: { // Build void __tgt_target_data_begin(int64_t device_id, int32_t arg_num, // void** args_base, void **args, int64_t *arg_sizes, int64_t *arg_types); @@ -4378,57 +4354,6 @@ QualType CGOpenMPRuntime::getTgtOffloadEntryQTy() { return TgtOffloadEntryQTy; } -QualType CGOpenMPRuntime::getTgtDeviceImageQTy() { - // These are the types we need to build: - // struct __tgt_device_image{ - // void *ImageStart; // Pointer to the target code start. - // void *ImageEnd; // Pointer to the target code end. 
- // // We also add the host entries to the device image, as it may be useful - // // for the target runtime to have access to that information. - // __tgt_offload_entry *EntriesBegin; // Begin of the table with all - // // the entries. - // __tgt_offload_entry *EntriesEnd; // End of the table with all the - // // entries (non inclusive). - // }; - if (TgtDeviceImageQTy.isNull()) { - ASTContext &C = CGM.getContext(); - RecordDecl *RD = C.buildImplicitRecord("__tgt_device_image"); - RD->startDefinition(); - addFieldToRecordDecl(C, RD, C.VoidPtrTy); - addFieldToRecordDecl(C, RD, C.VoidPtrTy); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - RD->completeDefinition(); - TgtDeviceImageQTy = C.getRecordType(RD); - } - return TgtDeviceImageQTy; -} - -QualType CGOpenMPRuntime::getTgtBinaryDescriptorQTy() { - // struct __tgt_bin_desc{ - // int32_t NumDevices; // Number of devices supported. - // __tgt_device_image *DeviceImages; // Arrays of device images - // // (one per device). - // __tgt_offload_entry *EntriesBegin; // Begin of the table with all the - // // entries. - // __tgt_offload_entry *EntriesEnd; // End of the table with all the - // // entries (non inclusive). - // }; - if (TgtBinaryDescriptorQTy.isNull()) { - ASTContext &C = CGM.getContext(); - RecordDecl *RD = C.buildImplicitRecord("__tgt_bin_desc"); - RD->startDefinition(); - addFieldToRecordDecl( - C, RD, C.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/true)); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtDeviceImageQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - addFieldToRecordDecl(C, RD, C.getPointerType(getTgtOffloadEntryQTy())); - RD->completeDefinition(); - TgtBinaryDescriptorQTy = C.getRecordType(RD); - } - return TgtBinaryDescriptorQTy; -} - namespace { struct PrivateHelpersTy { PrivateHelpersTy(const VarDecl *Original, const VarDecl *PrivateCopy, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index c40308ee74971..8159f5e8b790f 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -439,29 +439,10 @@ class CGOpenMPRuntime { /// // (function or global) /// char *name; // Name of the function or global. /// size_t size; // Size of the entry info (0 if it a function). + /// int32_t flags; + /// int32_t reserved; /// }; QualType TgtOffloadEntryQTy; - /// struct __tgt_device_image{ - /// void *ImageStart; // Pointer to the target code start. - /// void *ImageEnd; // Pointer to the target code end. - /// // We also add the host entries to the device image, as it may be useful - /// // for the target runtime to have access to that information. - /// __tgt_offload_entry *EntriesBegin; // Begin of the table with all - /// // the entries. - /// __tgt_offload_entry *EntriesEnd; // End of the table with all the - /// // entries (non inclusive). - /// }; - QualType TgtDeviceImageQTy; - /// struct __tgt_bin_desc{ - /// int32_t NumDevices; // Number of devices supported. - /// __tgt_device_image *DeviceImages; // Arrays of device images - /// // (one per device). - /// __tgt_offload_entry *EntriesBegin; // Begin of the table with all the - /// // entries. - /// __tgt_offload_entry *EntriesEnd; // End of the table with all the - /// // entries (non inclusive). - /// }; - QualType TgtBinaryDescriptorQTy; /// Entity that registers the offloading constants that were emitted so /// far. 
class OffloadEntriesInfoManagerTy { @@ -717,12 +698,6 @@ class CGOpenMPRuntime { /// Returns __tgt_offload_entry type. QualType getTgtOffloadEntryQTy(); - /// Returns __tgt_device_image type. - QualType getTgtDeviceImageQTy(); - - /// Returns __tgt_bin_desc type. - QualType getTgtBinaryDescriptorQTy(); - /// Start scanning from statement \a S and and emit all target regions /// found along the way. /// \param S Starting statement. From 667e1f71b83c48b635b13b64bbff28b95e68265c Mon Sep 17 00:00:00 2001 From: Tim Northover Date: Thu, 9 Jan 2020 14:28:48 +0000 Subject: [PATCH 60/89] IR: remove "else" after "return". NFCI. --- llvm/lib/IR/Core.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp index 63b98f26ba193..04e34a90a9bcf 100644 --- a/llvm/lib/IR/Core.cpp +++ b/llvm/lib/IR/Core.cpp @@ -147,9 +147,9 @@ LLVMAttributeRef LLVMCreateEnumAttribute(LLVMContextRef C, unsigned KindID, // After r362128, byval attributes need to have a type attribute. Provide a // NULL one until a proper API is added for this. return wrap(Attribute::getWithByValType(Ctx, NULL)); - } else { - return wrap(Attribute::get(Ctx, AttrKind, Val)); } + + return wrap(Attribute::get(Ctx, AttrKind, Val)); } unsigned LLVMGetEnumAttributeKind(LLVMAttributeRef A) { From d857e114b5e04f5143485a5aea7ad9b283768692 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 9 Jan 2020 10:49:41 +0000 Subject: [PATCH 61/89] [ARM,MVE] Fix valid immediate range for vsliq_n. In common with most MVE immediate shift instructions, the left shift takes an immediate in the range [0,n-1], while the right shift takes one in the range [1,n]. I had absent-mindedly made them both the latter. While I'm here, I've added a set of regression tests checking both ends of the immediate range for a representative sample of the immediate shifts. 
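For illustration only (not part of this patch): a minimal C sketch of the convention described above, using the explicitly-typed ACLE intrinsic names and assuming compilation for an MVE-enabled target. Left-shift immediates run over [0, n-1] and right-shift immediates over [1, n], where n is the element width in bits; the Sema test added below exercises the exact diagnostics.

#include <arm_mve.h>

uint8x16_t shift_immediate_ranges(uint8x16_t a, uint8x16_t b) {
  uint8x16_t l = vsliq_n_u8(a, b, 7); // in range: left shifts of 8-bit lanes take [0, 7]
  uint8x16_t r = vsriq_n_u8(a, b, 8); // in range: right shifts of 8-bit lanes take [1, 8]
  // vsliq_n_u8(a, b, 8) and vsriq_n_u8(a, b, 0) would be rejected as
  // "outside the valid range", as the test below checks.
  return veorq_u8(l, r);
}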
--- clang/include/clang/Basic/arm_mve.td | 2 +- clang/test/Sema/arm-mve-immediates.c | 93 ++++++++++++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td index 87091a3250715..86a04e33ce760 100644 --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -684,7 +684,7 @@ let params = [s16, s32], pnt = PNT_NType in { defm vqrshrun : VSHRN; } let params = T.Int, pnt = PNT_NType in { - defm vsli : DyadicImmShift; + defm vsli : DyadicImmShift; defm vsri : DyadicImmShift; } diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c index 54cdb96efcd3b..b8106fbb70282 100644 --- a/clang/test/Sema/arm-mve-immediates.c +++ b/clang/test/Sema/arm-mve-immediates.c @@ -110,3 +110,96 @@ void test_lane_indices(uint8x16_t v16, uint16x8_t v8, vsetq_lane_u64(23, v2, 1); vsetq_lane_u64(23, v2, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} } + +void test_immediate_shifts(uint8x16_t vb, uint16x8_t vh, uint32x4_t vw) +{ + vshlq_n(vb, 0); + vshlq_n(vb, 7); + vshlq_n(vh, 0); + vshlq_n(vh, 15); + vshlq_n(vw, 0); + vshlq_n(vw, 31); + + vshlq_n(vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vshlq_n(vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vshlq_n(vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vshlq_n(vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vshlq_n(vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vshlq_n(vw, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + + vqshlq_n(vb, 0); + vqshlq_n(vb, 7); + vqshlq_n(vh, 0); + vqshlq_n(vh, 15); + vqshlq_n(vw, 0); + vqshlq_n(vw, 31); + + vqshlq_n(vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vqshlq_n(vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vqshlq_n(vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vqshlq_n(vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vqshlq_n(vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vqshlq_n(vw, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + + vsliq(vb, vb, 0); + vsliq(vb, vb, 7); + vsliq(vh, vh, 0); + vsliq(vh, vh, 15); + vsliq(vw, vw, 0); + vsliq(vw, vw, 31); + + vsliq(vb, vb, -1); // expected-error {{argument value -1 is outside the valid range [0, 7]}} + vsliq(vb, vb, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} + vsliq(vh, vh, -1); // expected-error {{argument value -1 is outside the valid range [0, 15]}} + vsliq(vh, vh, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} + vsliq(vw, vw, -1); // expected-error {{argument value -1 is outside the valid range [0, 31]}} + vsliq(vw, vw, 32); // expected-error {{argument value 32 is outside the valid range [0, 31]}} + + vshllbq(vb, 1); + vshllbq(vb, 8); + vshllbq(vh, 1); + vshllbq(vh, 16); + + vshllbq(vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshllbq(vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshllbq(vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshllbq(vh, 17); // expected-error 
{{argument value 17 is outside the valid range [1, 16]}} + + vshrq(vb, 1); + vshrq(vb, 8); + vshrq(vh, 1); + vshrq(vh, 16); + vshrq(vw, 1); + vshrq(vw, 32); + + vshrq(vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshrq(vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshrq(vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshrq(vh, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + vshrq(vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + vshrq(vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} + + vshrntq(vb, vh, 1); + vshrntq(vb, vh, 8); + vshrntq(vh, vw, 1); + vshrntq(vh, vw, 16); + + vshrntq(vb, vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vshrntq(vb, vh, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vshrntq(vh, vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vshrntq(vh, vw, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + + vsriq(vb, vb, 1); + vsriq(vb, vb, 8); + vsriq(vh, vh, 1); + vsriq(vh, vh, 16); + vsriq(vw, vw, 1); + vsriq(vw, vw, 32); + + vsriq(vb, vb, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vsriq(vb, vb, 9); // expected-error {{argument value 9 is outside the valid range [1, 8]}} + vsriq(vh, vh, 0); // expected-error {{argument value 0 is outside the valid range [1, 16]}} + vsriq(vh, vh, 17); // expected-error {{argument value 17 is outside the valid range [1, 16]}} + vsriq(vw, vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} + vsriq(vw, vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} +} From 9704ba652a0062c53ec66b068766df5c0cd5c620 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 9 Jan 2020 11:23:20 +0000 Subject: [PATCH 62/89] [ARM,MVE] Add missing IntrNoMem flag on IR intrinsics. A lot of the IR-level intrinsics we've been defining for MVE recently accidentally had `props = []` instead of `props = [IntrNoMem]`, so that optimization would have been overcautious about reordering them. All the affected cases were due to instantiating the multiclasses `MVEPredicated` and `MVEMXPredicated` without filling in the `props` parameter, because I //thought// I remembered having set the defaults in those multiclasses to `[IntrNoMem]`. In fact I hadn't done that. Now I have. (The IR intrinsics that //do// read and write memory are all explicitly marked as `[IntrReadMem]` or `[IntrWriteMem]` already, so they will override these defaults.) 
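For illustration only (not part of this patch): a reduced TableGen sketch, with simplified names, of the shape of the fix: the `props` multiclass parameter now defaults to [IntrNoMem], so an instantiation that omits it still gets the no-memory-access property.

// Illustrative multiclass, not the exact one in IntrinsicsARM.td.
multiclass ExamplePredicated<list<LLVMType> rets, list<LLVMType> params,
                             LLVMType pred = llvm_anyvector_ty,
                             list<IntrinsicProperty> props = [IntrNoMem]> {
  def "": Intrinsic<rets, params, props>;
  def _predicated: Intrinsic<rets, !listconcat(params, [pred]), props>;
}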
--- llvm/include/llvm/IR/IntrinsicsARM.td | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td index 35d40c67b44f2..518ad7079225e 100644 --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -773,6 +773,9 @@ class Neon_Dot_Intrinsic def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; +def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; +def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; + def int_arm_mve_vctp8 : Intrinsic<[llvm_v16i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp16 : Intrinsic<[llvm_v8i1_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_mve_vctp32 : Intrinsic<[llvm_v4i1_ty], [llvm_i32_ty], [IntrNoMem]>; @@ -881,7 +884,7 @@ defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty], multiclass MVEPredicated rets, list params, LLVMType pred = llvm_anyvector_ty, - list props = []> { + list props = [IntrNoMem]> { def "": Intrinsic; def _predicated: Intrinsic; } @@ -895,7 +898,7 @@ multiclass MVEPredicatedM rets, list params, } defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty], - [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty, [IntrNoMem]>; + [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>; defm int_arm_mve_vldr_gather_base: MVEPredicated< [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], @@ -1033,7 +1036,7 @@ def int_arm_mve_vmull_poly: Intrinsic< multiclass MVEMXPredicated rets, list flags, list params, LLVMType inactive, LLVMType predicate, - list props = []> { + list props = [IntrNoMem]> { def "": Intrinsic; def _predicated: Intrinsic; @@ -1047,7 +1050,7 @@ multiclass MVEMXPredicated rets, list flags, defm int_arm_mve_vcaddq : MVEMXPredicated< [llvm_anyvector_ty], [llvm_i32_ty, llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - LLVMMatchType<0>, llvm_anyvector_ty, [IntrNoMem]>; + LLVMMatchType<0>, llvm_anyvector_ty>; // The first operand of the following two intrinsics is the rotation angle // (must be a compile-time constant): @@ -1058,12 +1061,12 @@ defm int_arm_mve_vcaddq : MVEMXPredicated< defm int_arm_mve_vcmulq : MVEMXPredicated< [llvm_anyvector_ty], [llvm_i32_ty], [LLVMMatchType<0>, LLVMMatchType<0>], - LLVMMatchType<0>, llvm_anyvector_ty, [IntrNoMem]>; + LLVMMatchType<0>, llvm_anyvector_ty>; defm int_arm_mve_vcmlaq : MVEPredicated< [llvm_anyvector_ty], [llvm_i32_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; def int_arm_mve_vld2q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; def int_arm_mve_vld4q: Intrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [llvm_anyptr_ty], [IntrReadMem]>; @@ -1072,9 +1075,6 @@ def int_arm_mve_vst2q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMat def int_arm_mve_vst4q: Intrinsic<[], [llvm_anyptr_ty, llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>, llvm_i32_ty], [IntrWriteMem] >; -def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; -def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; - // MVE vector absolute difference and accumulate across vector // The first operand is an 'unsigned' flag. 
The remaining operands are: // * accumulator @@ -1083,8 +1083,7 @@ def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; // * mask (only in predicated versions) defm int_arm_mve_vabav: MVEPredicated< [llvm_i32_ty], - [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty, - [IntrNoMem]>; + [llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], llvm_anyvector_ty>; // The following 3 instrinsics are MVE vector reductions with two vector // operands. @@ -1107,19 +1106,19 @@ defm int_arm_mve_vmldava: MVEPredicated< [llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; // Version with 64-bit result, vml{a,s}ldav[a][x] defm int_arm_mve_vmlldava: MVEPredicated< [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; // Version with 72-bit rounded result, vrml{a,s}ldavh[a][x] defm int_arm_mve_vrmlldavha: MVEPredicated< [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>], - llvm_anyvector_ty, [IntrNoMem]>; + llvm_anyvector_ty>; } // end TargetPrefix From db7c92077963195df0807e976cc916b5c6e29a05 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 8 Jan 2020 14:12:19 -0500 Subject: [PATCH 63/89] AMDGPU: Add register class to DS_SWIZZLE_B32 pattern Reduces diff for a future patch. --- llvm/lib/Target/AMDGPU/DSInstructions.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index f008b800bd327..f0987cffd1065 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -619,7 +619,7 @@ def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">; def : GCNPat < (int_amdgcn_ds_swizzle i32:$src, timm:$offset16), - (DS_SWIZZLE_B32 $src, (as_i16imm $offset16), (i1 0)) + (DS_SWIZZLE_B32 VGPR_32:$src, (as_i16imm $offset16), (i1 0)) >; class DSReadPat : GCNPat < From 0274ed9dc75a0efb2b6130122226ee45f7e57dde Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 8 Jan 2020 18:57:44 -0500 Subject: [PATCH 64/89] TableGen/GlobalISel: Fix slightly wrong generated comment --- llvm/test/TableGen/GlobalISelEmitter.td | 2 +- llvm/utils/TableGen/GlobalISelEmitter.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/test/TableGen/GlobalISelEmitter.td b/llvm/test/TableGen/GlobalISelEmitter.td index 41c825950ca2f..59816497cc937 100644 --- a/llvm/test/TableGen/GlobalISelEmitter.td +++ b/llvm/test/TableGen/GlobalISelEmitter.td @@ -210,7 +210,7 @@ def HasC : Predicate<"Subtarget->hasC()"> { let RecomputePerFunction = 1; } // CHECK-NEXT: }; // CHECK-NEXT: MyTargetInstructionSelector::CustomRendererFn // CHECK-NEXT: MyTargetInstructionSelector::CustomRenderers[] = { -// CHECK-NEXT: nullptr, // GICP_Invalid +// CHECK-NEXT: nullptr, // GICR_Invalid // CHECK-NEXT: &MyTargetInstructionSelector::renderImm8, // gi_cimm8 // CHECK-NEXT: }; diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index b2e3903eda8b5..99b067d5b5270 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -5283,7 +5283,7 @@ void GlobalISelEmitter::run(raw_ostream &OS) { OS << Target.getName() << "InstructionSelector::CustomRendererFn\n" 
<< Target.getName() << "InstructionSelector::CustomRenderers[] = {\n" - << " nullptr, // GICP_Invalid\n"; + << " nullptr, // GICR_Invalid\n"; for (const auto &Record : CustomRendererFns) OS << " &" << Target.getName() << "InstructionSelector::" << Record->getValueAsString("RendererFn") From 3952748ffdf017f83faddcb1240cb36cb4bb9c5b Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 11:29:05 -0500 Subject: [PATCH 65/89] AMDGPU/GlobalISel: Fix add of neg inline constant pattern --- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 6 + .../Target/AMDGPU/AMDGPUInstructionSelector.h | 3 + llvm/lib/Target/AMDGPU/SIInstructions.td | 15 ++- .../AMDGPU/GlobalISel/inst-select-add.mir | 113 ++++++++++++++++++ 5 files changed, 139 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index d7c211f1ed930..0583610143188 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -204,3 +204,6 @@ foreach Ty = [i64, p0, p1, p4] in { def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, GISDNodeXFormEquiv; + +def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, + GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a41c8f1a6a3f3..a632e7aece1e9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2097,6 +2097,12 @@ void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB, MIB.addImm(CstVal.getValue()); } +void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 45782ab3185ef..8d44c58886435 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -169,6 +169,9 @@ class AMDGPUInstructionSelector : public InstructionSelector { void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderNegateImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + bool isInlineImmediate16(int64_t Imm) const; bool isInlineImmediate32(int64_t Imm) const; bool isInlineImmediate64(int64_t Imm) const; diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index dcc139a9fe943..4c8197975ce68 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1930,9 +1930,22 @@ def : GCNPat < // TODO: Also do for 64-bit. 
def : GCNPat< (add i32:$src0, (i32 NegSubInlineConst32:$src1)), - (S_SUB_I32 $src0, NegSubInlineConst32:$src1) + (S_SUB_I32 SReg_32:$src0, NegSubInlineConst32:$src1) >; +def : GCNPat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (V_SUB_U32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { + let SubtargetPredicate = HasAddNoCarryInsts; +} + +def : GCNPat< + (add i32:$src0, (i32 NegSubInlineConst32:$src1)), + (V_SUB_I32_e64 VS_32:$src0, NegSubInlineConst32:$src1)> { + let SubtargetPredicate = NotHasAddNoCarryInsts; +} + + // Avoid pointlessly materializing a constant in VGPR. // FIXME: Should also do this for readlane, but tablegen crashes on // the ignored src1. diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir index 43931132107aa..b0f9cc52ae144 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.mir @@ -53,3 +53,116 @@ body: | S_ENDPGM 0, implicit %9 ... + +--- +name: add_neg_inline_const_64_to_sub_s32_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_s + ; GFX6: liveins: $sgpr0 + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], 64, implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_SUB_I32_]] + ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_s + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[COPY]], 64, implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_SUB_I32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 -64 + %2:sgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: add_neg_inline_const_64_to_sub_s32_v +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_64_to_sub_s32_v + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 4294967232, implicit $exec + ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit %2 + ; GFX9-LABEL: name: add_neg_inline_const_64_to_sub_s32_v + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_SUB_U32_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U32_e64 [[COPY]], 64, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_SUB_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 -64 + %2:vgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... 
+ +--- +name: add_neg_inline_const_16_to_sub_s32_s +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $sgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_16_to_sub_s32_s + ; GFX6: liveins: $sgpr0 + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX6: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_ADD_U32_]] + ; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_s + ; GFX9: liveins: $sgpr0 + ; GFX9: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX9: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 16 + ; GFX9: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[S_MOV_B32_]], implicit-def $scc + ; GFX9: S_ENDPGM 0, implicit [[S_ADD_U32_]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = G_CONSTANT i32 16 + %2:sgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... + +--- +name: add_neg_inline_const_16_to_sub_s32_v +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_neg_inline_const_16_to_sub_s32_v + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GFX6: %2:vgpr_32, dead %3:sreg_64_xexec = V_ADD_I32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX6: S_ENDPGM 0, implicit %2 + ; GFX9-LABEL: name: add_neg_inline_const_16_to_sub_s32_v + ; GFX9: liveins: $vgpr0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 16, implicit $exec + ; GFX9: [[V_ADD_U32_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e64 [[COPY]], [[V_MOV_B32_e32_]], 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_ADD_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = G_CONSTANT i32 16 + %2:vgpr(s32) = G_ADD %0, %1 + S_ENDPGM 0, implicit %2 + +... From d964086c62422771c1d6dbe66ee8ea06e8f834b2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 11:45:10 -0500 Subject: [PATCH 66/89] AMDGPU/GlobalISel: Add equiv xform for bitcast_fpimm_to_i32 Only partially fixes one pattern import. 
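For illustration only (not part of this patch): the renderer emits a floating-point constant's raw IEEE-754 bit pattern as an integer immediate. A minimal standalone C++ sketch of that step using LLVM's APFloat; 1.0f, for example, becomes the immediate 0x3f800000.

#include "llvm/ADT/APFloat.h"
#include <cstdint>

// Take the FP constant's bit pattern and return it as a plain integer.
static uint64_t bitcastFPImmToInt(const llvm::APFloat &Val) {
  return Val.bitcastToAPInt().getZExtValue();
}
// e.g. bitcastFPImmToInt(llvm::APFloat(1.0f)) == 0x3f800000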
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 11 +++++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +++ 3 files changed, 17 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 0583610143188..b198bfb0e4115 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -207,3 +207,6 @@ def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">, def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, GISDNodeXFormEquiv; + +def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">, + GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index a632e7aece1e9..876c7c35294df 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2103,6 +2103,17 @@ void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB, MIB.addImm(-MI.getOperand(1).getCImm()->getSExtValue()); } +void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + const MachineOperand &Op = MI.getOperand(1); + if (MI.getOpcode() == TargetOpcode::G_FCONSTANT) + MIB.addImm(Op.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue()); + else { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(Op.getCImm()->getSExtValue()); + } +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 8d44c58886435..1de8a0abbc1bb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -172,6 +172,9 @@ class AMDGPUInstructionSelector : public InstructionSelector { void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderBitcastImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + bool isInlineImmediate16(int64_t Imm) const; bool isInlineImmediate32(int64_t Imm) const; bool isInlineImmediate64(int64_t Imm) const; From 79450a4ea26a0e9731eaf2629f6dd8c1ffd8f407 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 12:21:18 -0500 Subject: [PATCH 67/89] AMDGPU/GlobalISel: Add selectVOP3Mods_nnan This doesn't enable any new imports yet, but moves the fmed patterns from failing on this to hitting the "complex suboperand referenced more than once" limitation in tablegen. 
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 4 ++++ .../Target/AMDGPU/AMDGPUInstructionSelector.cpp | 14 ++++++++++++++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 2 ++ 3 files changed, 20 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index b198bfb0e4115..9b62e54da2b1f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -30,6 +30,10 @@ def gi_vop3mods : GIComplexOperandMatcher, GIComplexPatternEquiv; +def gi_vop3mods_nnan : + GIComplexOperandMatcher, + GIComplexPatternEquiv; + def gi_vop3omods : GIComplexOperandMatcher, GIComplexPatternEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 876c7c35294df..c0830ac2a0b01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1770,6 +1770,20 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const { }}; } +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const { + Register Src; + unsigned Mods; + std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg()); + if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI)) + return None; + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const { // FIXME: Handle clamp and op_sel diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 1de8a0abbc1bb..c249c7e44e66f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -132,6 +132,8 @@ class AMDGPUInstructionSelector : public InstructionSelector { selectVOP3OMods(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3Mods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectVOP3Mods_nnan(MachineOperand &Root) const; InstructionSelector::ComplexRendererFns selectVOP3OpSelMods0(MachineOperand &Root) const; From e71af775684a83f0d1d05ab5225d36830d5aa87e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 12:38:35 -0500 Subject: [PATCH 68/89] AMDGPU/GlobalISel: Add IMMPopCount xform Partially fixes BFE pattern import. 
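For illustration only (not part of this patch): the xform takes the population count of a constant operand; in the BFE patterns that turns a mask constant into a field width. A minimal standalone C++ sketch of the arithmetic, with an assumed example value:

#include "llvm/ADT/APInt.h"

// The popcount of a contiguous low-bit mask is the bitfield width,
// e.g. a 32-bit mask of 0xff has popcount 8, so an 'and' with it
// corresponds to extracting an 8-bit field.
static unsigned maskToBFEWidth(const llvm::APInt &Mask) {
  return Mask.countPopulation();
}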
--- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 3 +++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 6 ++++++ llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h | 3 +++ 3 files changed, 12 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td index 9b62e54da2b1f..f9983693a99ec 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td @@ -214,3 +214,6 @@ def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">, def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">, GISDNodeXFormEquiv; + +def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">, + GISDNodeXFormEquiv; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c0830ac2a0b01..095c667e14340 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -2128,6 +2128,12 @@ void AMDGPUInstructionSelector::renderBitcastImm(MachineInstrBuilder &MIB, } } +void AMDGPUInstructionSelector::renderPopcntImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const { + assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && "Expected G_CONSTANT"); + MIB.addImm(MI.getOperand(1).getCImm()->getValue().countPopulation()); +} + bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const { return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm()); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index c249c7e44e66f..0799ace086756 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -177,6 +177,9 @@ class AMDGPUInstructionSelector : public InstructionSelector { void renderBitcastImm(MachineInstrBuilder &MIB, const MachineInstr &MI) const; + void renderPopcntImm(MachineInstrBuilder &MIB, + const MachineInstr &MI) const; + bool isInlineImmediate16(int64_t Imm) const; bool isInlineImmediate32(int64_t Imm) const; bool isInlineImmediate64(int64_t Imm) const; From 7d677421607cbfdd8d1e96275c613d3db8a0e51f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 13:10:08 -0500 Subject: [PATCH 69/89] AMDGPU/GlobalISel: Fix import of zext of s16 op patterns --- .../AMDGPU/AMDGPUInstructionSelector.cpp | 2 + llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6 +- .../AMDGPU/GlobalISel/inst-select-add.s16.mir | 132 ++++++++++++++++++ .../GlobalISel/inst-select-ashr.s16.mir | 6 +- .../GlobalISel/inst-select-lshr.s16.mir | 6 +- .../AMDGPU/GlobalISel/inst-select-shl.s16.mir | 6 +- 6 files changed, 143 insertions(+), 15 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 095c667e14340..132c51c9e08f1 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -1684,6 +1684,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { case TargetOpcode::G_SEXT: case TargetOpcode::G_ZEXT: case TargetOpcode::G_ANYEXT: + if (selectImpl(I, *CoverageInfo)) + return true; return selectG_SZA_EXT(I); case TargetOpcode::G_BRCOND: return selectG_BRCOND(I); diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 064b26665542f..aaadc3dbc7215 100644 --- 
a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -729,7 +729,7 @@ multiclass Arithmetic_i16_0Hi_Pats { def : GCNPat< (i32 (zext (op i16:$src0, i16:$src1))), - (inst $src0, $src1) + (inst VSrc_b16:$src0, VSrc_b16:$src1) >; def : GCNPat< @@ -771,7 +771,7 @@ let Predicates = [Has16BitInsts] in { // TODO: Also do for 64-bit. def : GCNPat< (add i16:$src0, (i16 NegSubInlineConst16:$src1)), - (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) >; @@ -779,7 +779,7 @@ let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in { def : GCNPat< (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))), - (V_SUB_U16_e64 $src0, NegSubInlineConst16:$src1) + (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1) >; defm : Arithmetic_i16_0Hi_Pats; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir new file mode 100644 index 0000000000000..cc48e9126c9b7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir @@ -0,0 +1,132 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s +# RUN: llc -march=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s + +# Note: 16-bit instructions generally produce a 0 result in the high 16-bits on GFX8 and GFX9 and preserve high 16 bits on GFX10+ + +--- +name: add_s16 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: add_s16 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-LABEL: name: add_s16 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ADD %2, %3 + S_ENDPGM 0, implicit %4 + +... 
+ +--- +name: add_s16_zext_to_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + ; GFX6-LABEL: name: add_s16_zext_to_s32 + ; GFX6: liveins: $vgpr0, $vgpr1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]] + ; GFX10-LABEL: name: add_s16_zext_to_s32 + ; GFX10: liveins: $vgpr0, $vgpr1 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX10: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ADD_U16_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s16) = G_TRUNC %0 + %3:vgpr(s16) = G_TRUNC %1 + %4:vgpr(s16) = G_ADD %2, %3 + %5:vgpr(s32) = G_ZEXT %4 + S_ENDPGM 0, implicit %5 + +... + +--- +name: add_s16_neg_inline_const_64 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_s16_neg_inline_const_64 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-LABEL: name: add_s16_neg_inline_const_64 + ; GFX10: liveins: $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_CONSTANT i16 -64 + %3:vgpr(s16) = G_ADD %1, %2 + S_ENDPGM 0, implicit %3 + +... + +--- +name: add_s16_neg_inline_const_64_zext_to_s32 +legalized: true +regBankSelected: true +tracksRegLiveness: true + +body: | + bb.0: + liveins: $vgpr0 + + ; GFX6-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 + ; GFX6: liveins: $vgpr0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]] + ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32 + ; GFX10: liveins: $vgpr0 + ; GFX10: $vcc_hi = IMPLICIT_DEF + ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX10: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, implicit $exec + ; GFX10: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_SUB_U16_e64_]], 0, 16, implicit $exec + ; GFX10: S_ENDPGM 0, implicit [[V_BFE_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s16) = G_TRUNC %0 + %2:vgpr(s16) = G_CONSTANT i16 -64 + %3:vgpr(s16) = G_ADD %1, %2 + %4:vgpr(s32) = G_ZEXT %3 + S_ENDPGM 0, implicit %4 + +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir index b3402a6051488..8d3d677b3c007 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ashr.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX9-LABEL: name: ashr_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_ASHRREV_I16_e64_:%[0-9]+]]:vgpr_32 = V_ASHRREV_I16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_ASHRREV_I16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_ASHRREV_I16_e64_]] ; GFX10-LABEL: name: ashr_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir index 866ab39fe2d19..f28e35669357b 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-lshr.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX9-LABEL: name: lshr_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_LSHRREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHRREV_B16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_LSHRREV_B16_e64_]] ; GFX10-LABEL: name: lshr_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir index 4321e4e7ca214..3085bb7201513 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-shl.s16.mir @@ -168,14 +168,12 @@ body: | ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX8: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX8: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec - ; GFX8: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX8: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] ; GFX9-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; GFX9: [[V_LSHLREV_B16_e64_:%[0-9]+]]:vgpr_32 = 
V_LSHLREV_B16_e64 [[COPY1]], [[COPY]], implicit $exec - ; GFX9: [[V_BFE_U32_:%[0-9]+]]:vgpr_32 = V_BFE_U32 [[V_LSHLREV_B16_e64_]], 0, 16, implicit $exec - ; GFX9: S_ENDPGM 0, implicit [[V_BFE_U32_]] + ; GFX9: S_ENDPGM 0, implicit [[V_LSHLREV_B16_e64_]] ; GFX10-LABEL: name: shl_s16_s16_vv_zext_to_s32 ; GFX10: $vcc_hi = IMPLICIT_DEF ; GFX10: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 From c1d4963b447c9330c2ad50bb73bb93f9a42c9641 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 16:48:05 -0500 Subject: [PATCH 70/89] AMDGPU: Use new PatFrag system for d16 load nodes --- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 55 +++++++++++---------------- 1 file changed, 23 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 609a345ea18ca..1518beafc7aba 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -579,46 +579,37 @@ def si_setcc_uniform : PatFrag < // SDNodes PatFrags for d16 loads //===----------------------------------------------------------------------===// -class LoadD16Frag : PatFrag<(ops node:$ptr, node:$tied_in), (op node:$ptr, node:$tied_in)>; -class LocalLoadD16 : LoadD16Frag , LocalAddress; -class GlobalLoadD16 : LoadD16Frag , GlobalLoadAddress; -class PrivateLoadD16 : LoadD16Frag , PrivateAddress; -class FlatLoadD16 : LoadD16Frag , FlatLoadAddress; - -def load_d16_hi_local : LocalLoadD16 ; -def az_extloadi8_d16_hi_local : LocalLoadD16 ; -def sextloadi8_d16_hi_local : LocalLoadD16 ; - -def load_d16_hi_global : GlobalLoadD16 ; -def az_extloadi8_d16_hi_global : GlobalLoadD16 ; -def sextloadi8_d16_hi_global : GlobalLoadD16 ; - -def load_d16_hi_private : PrivateLoadD16 ; -def az_extloadi8_d16_hi_private : PrivateLoadD16 ; -def sextloadi8_d16_hi_private : PrivateLoadD16 ; +class LoadD16Frag : PatFrag< + (ops node:$ptr, node:$tied_in), + (op node:$ptr, node:$tied_in)> { + let IsLoad = 1; +} -def load_d16_hi_flat : FlatLoadD16 ; -def az_extloadi8_d16_hi_flat : FlatLoadD16 ; -def sextloadi8_d16_hi_flat : FlatLoadD16 ; +foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { +let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { +def load_d16_hi_#as : LoadD16Frag ; -def load_d16_lo_local : LocalLoadD16 ; -def az_extloadi8_d16_lo_local : LocalLoadD16 ; -def sextloadi8_d16_lo_local : LocalLoadD16 ; +def az_extloadi8_d16_hi_#as : LoadD16Frag { + let MemoryVT = i8; +} -def load_d16_lo_global : GlobalLoadD16 ; -def az_extloadi8_d16_lo_global : GlobalLoadD16 ; -def sextloadi8_d16_lo_global : GlobalLoadD16 ; +def sextloadi8_d16_hi_#as : LoadD16Frag { + let MemoryVT = i8; +} -def load_d16_lo_private : PrivateLoadD16 ; -def az_extloadi8_d16_lo_private : PrivateLoadD16 ; -def sextloadi8_d16_lo_private : PrivateLoadD16 ; +def load_d16_lo_#as : LoadD16Frag ; -def load_d16_lo_flat : FlatLoadD16 ; -def az_extloadi8_d16_lo_flat : FlatLoadD16 ; -def sextloadi8_d16_lo_flat : FlatLoadD16 ; +def az_extloadi8_d16_lo_#as : LoadD16Frag { + let MemoryVT = i8; +} +def sextloadi8_d16_lo_#as : LoadD16Frag { + let MemoryVT = i8; +} +} // End let AddressSpaces = ... 
+} // End foreach AddrSpace def lshr_rev : PatFrag < (ops node:$src1, node:$src0), From 3766f4baccac5cc17680ad4cefd1d5a0d3ba2870 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 17:46:52 -0500 Subject: [PATCH 71/89] AMDGPU: Use new PatFrag system for d16 stores --- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 20 +++++++------------- llvm/lib/Target/AMDGPU/DSInstructions.td | 4 ++-- 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 89e52d63af2d2..cc006f38fe579 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -343,8 +343,9 @@ class StoreFrag : PatFrag < >; class StoreHi16 : PatFrag < - (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr) ->; + (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> { + let IsStore = 1; +} def LoadAddress_constant : AddressSpaceList<[ AddrSpaces.Constant ]>; def LoadAddress_global : AddressSpaceList<[ AddrSpaces.Global, AddrSpaces.Constant ]>; @@ -485,6 +486,10 @@ def truncstorei16_#as : PatFrag<(ops node:$val, node:$ptr), let MemoryVT = i16; } +def store_hi16_#as : StoreHi16 ; +def truncstorei8_hi16_#as : StoreHi16; +def truncstorei16_hi16_#as : StoreHi16; + defm atomic_store_#as : binary_atomic_op; } // End let AddressSpaces = ... @@ -520,16 +525,7 @@ defm atomic_load_xor : ret_noret_binary_atomic_op; defm atomic_load_fadd : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; - -def store_hi16_private : StoreHi16 , PrivateAddress; -def truncstorei8_hi16_private : StoreHi16, PrivateAddress; - def store_atomic_global : GlobalStore; -def truncstorei8_hi16_global : StoreHi16 , GlobalAddress; -def truncstorei16_hi16_global : StoreHi16 , GlobalAddress; - -def store_local_hi16 : StoreHi16 , LocalAddress; -def truncstorei8_local_hi16 : StoreHi16, LocalAddress; def atomic_store_local : LocalStore ; @@ -559,8 +555,6 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr), def atomic_store_flat : FlatStore ; -def truncstorei8_hi16_flat : StoreHi16, FlatStoreAddress; -def truncstorei16_hi16_flat : StoreHi16, FlatStoreAddress; class local_binary_atomic_op : diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index f0987cffd1065..f4e50e3a15e9a 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -733,8 +733,8 @@ defm : DSAtomicWritePat_mc ; defm : DSAtomicWritePat_mc ; let OtherPredicates = [D16PreservesUnusedBits] in { -def : DSWritePat ; -def : DSWritePat ; +def : DSWritePat ; +def : DSWritePat ; } From c66b2e1c87ecde72eb37d3452ec9c1b8766ede30 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 18:52:18 -0500 Subject: [PATCH 72/89] AMDGPU: Eliminate more legacy codepred address space PatFrags These should now be limited to R600 code. 
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 84 -------------------- llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 4 +- llvm/lib/Target/AMDGPU/R600Instructions.td | 25 ++++-- 4 files changed, 24 insertions(+), 93 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index cc006f38fe579..6541470f06a24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -336,12 +336,6 @@ class Aligned { int MinAlignment = Bytes; } -class LoadFrag : PatFrag<(ops node:$ptr), (op node:$ptr)>; - -class StoreFrag : PatFrag < - (ops node:$value, node:$ptr), (op node:$value, node:$ptr) ->; - class StoreHi16 : PatFrag < (ops node:$value, node:$ptr), (op (srl node:$value, (i32 16)), node:$ptr)> { let IsStore = 1; @@ -367,48 +361,6 @@ def StoreAddress_region : AddressSpaceList<[ AddrSpaces.Region ]>; -class GlobalLoadAddress : CodePatPred<[{ - auto AS = cast(N)->getAddressSpace(); - return AS == AMDGPUAS::GLOBAL_ADDRESS || AS == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -class FlatLoadAddress : CodePatPred<[{ - const auto AS = cast(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS || - AS == AMDGPUAS::CONSTANT_ADDRESS; -}]>; - -class GlobalAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -class PrivateAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS; -}]>; - -class LocalAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class RegionAddress : CodePatPred<[{ - return cast(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; - -class FlatStoreAddress : CodePatPred<[{ - const auto AS = cast(N)->getAddressSpace(); - return AS == AMDGPUAS::FLAT_ADDRESS || - AS == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - -// TODO: Remove these when stores to new PatFrag format. 
-class PrivateStore : StoreFrag , PrivateAddress; -class LocalStore : StoreFrag , LocalAddress; -class RegionStore : StoreFrag , RegionAddress; -class GlobalStore : StoreFrag, GlobalAddress; -class FlatStore : StoreFrag , FlatStoreAddress; - - foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in { let AddressSpaces = !cast("LoadAddress_"#as).AddrSpaces in { @@ -525,9 +477,6 @@ defm atomic_load_xor : ret_noret_binary_atomic_op; defm atomic_load_fadd : ret_noret_binary_atomic_op; defm AMDGPUatomic_cmp_swap : ret_noret_binary_atomic_op; -def store_atomic_global : GlobalStore; -def atomic_store_local : LocalStore ; - def load_align8_local : PatFrag <(ops node:$ptr), (load_local node:$ptr)> { let IsLoad = 1; @@ -553,28 +502,6 @@ def store_align16_local: PatFrag<(ops node:$val, node:$ptr), let IsTruncStore = 0; } - -def atomic_store_flat : FlatStore ; - - -class local_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS; -}]>; - -class region_binary_atomic_op : - PatFrag<(ops node:$ptr, node:$value), - (atomic_op node:$ptr, node:$value), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::REGION_ADDRESS; -}]>; - - -def mskor_global : PatFrag<(ops node:$val, node:$ptr), - (AMDGPUstore_mskor node:$val, node:$ptr), [{ - return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; -}]>; - let AddressSpaces = StoreAddress_local.AddrSpaces in { defm atomic_cmp_swap_local : ternary_atomic_op; defm atomic_cmp_swap_local_m0 : ternary_atomic_op; @@ -585,17 +512,6 @@ defm atomic_cmp_swap_region : ternary_atomic_op; defm atomic_cmp_swap_region_m0 : ternary_atomic_op; } -// Legacy. -def atomic_cmp_swap_global_noret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; - -def atomic_cmp_swap_global_ret : PatFrag< - (ops node:$ptr, node:$cmp, node:$value), - (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), - [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; - //===----------------------------------------------------------------------===// // Misc Pattern Fragments //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 1b12550aed88f..691aff4ecbb8a 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1621,8 +1621,8 @@ multiclass MUBUFStore_Atomic_Pattern ; } let SubtargetPredicate = isGFX6GFX7 in { -defm : MUBUFStore_Atomic_Pattern ; -defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; +defm : MUBUFStore_Atomic_Pattern ; } // End Predicates = isGFX6GFX7 diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index e106af42deddb..2057cac346d45 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -896,8 +896,8 @@ def : FlatSignedLoadPat_D16 ; def : FlatSignedLoadPat_D16 ; } -def : FlatStoreSignedAtomicPat ; -def : FlatStoreSignedAtomicPat ; +def : FlatStoreSignedAtomicPat ; +def : FlatStoreSignedAtomicPat ; def : FlatSignedAtomicPat ; def : FlatSignedAtomicPat ; diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td index 
f40eece859ee7..cbdf0de44f873 100644 --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -295,9 +295,23 @@ class VTX_READ pattern> let VTXInst = 1; } -// FIXME: Deprecated. -class LocalLoad : LoadFrag , LocalAddress; +// Legacy. +def atomic_cmp_swap_global_noret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>; + +def atomic_cmp_swap_global_ret : PatFrag< + (ops node:$ptr, node:$cmp, node:$value), + (atomic_cmp_swap node:$ptr, node:$cmp, node:$value), + [{return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>; + +def mskor_global : PatFrag<(ops node:$val, node:$ptr), + (AMDGPUstore_mskor node:$val, node:$ptr), [{ + return cast(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS; +}]>; +// FIXME: These are deprecated class AZExtLoadBase : PatFrag<(ops node:$ptr), (ld_node node:$ptr), [{ LoadSDNode *L = cast(N); @@ -319,9 +333,10 @@ def az_extloadi32 : PatFrag<(ops node:$ptr), (az_extload node:$ptr), [{ return cast(N)->getMemoryVT() == MVT::i32; }]>; -// FIXME: These are deprecated -def az_extloadi8_local : LocalLoad ; -def az_extloadi16_local : LocalLoad ; +let AddressSpaces = LoadAddress_local.AddrSpaces in { +def az_extloadi8_local : PatFrag<(ops node:$ptr), (az_extloadi8 node:$ptr)>; +def az_extloadi16_local : PatFrag<(ops node:$ptr), (az_extloadi16 node:$ptr)>; +} class LoadParamFrag : PatFrag < (ops node:$ptr), (load_type node:$ptr), From 9ffd0ed838191247e0da7df5e28e54a5129e76a7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 7 Jan 2020 20:22:06 -0500 Subject: [PATCH 73/89] AMDGPU/GlobalISel: Fix import of integer med3 This isn't too useful now, since nothing is currently trying to form min/max from cmp+select. 
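For reference, the min/max composition these patterns match is just the three-input median. A minimal standalone C++ check of that identity (illustration only, not part of this change; the helper names are made up):

    #include <algorithm>
    #include <cassert>

    // med3 via the composition matched by the patterns in this patch.
    static int med3_via_minmax(int a, int b, int c) {
      return std::min(std::max(a, b), std::max(std::min(a, b), c));
    }

    // Reference: sort the three values and take the middle one.
    static int med3_reference(int a, int b, int c) {
      int v[3] = {a, b, c};
      std::sort(v, v + 3);
      return v[1];
    }

    int main() {
      const int vals[] = {-7, 0, 3, 42};
      for (int a : vals)
        for (int b : vals)
          for (int c : vals)
            assert(med3_via_minmax(a, b, c) == med3_reference(a, b, c));
      return 0;
    }

The same identity holds with the roles of min and max swapped, which is why the TableGen multiclass below lists both orderings of the pattern.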
--- llvm/lib/Target/AMDGPU/AMDGPUInstructions.td | 24 --- llvm/lib/Target/AMDGPU/SIInstructions.td | 38 +++- .../GlobalISel/inst-select-pattern-smed3.mir | 140 +++++++++++++++ .../inst-select-pattern-smed3.s16.mir | 168 ++++++++++++++++++ .../GlobalISel/inst-select-pattern-umed3.mir | 140 +++++++++++++++ .../inst-select-pattern-umed3.s16.mir | 168 ++++++++++++++++++ 6 files changed, 646 insertions(+), 32 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td index 6541470f06a24..7e71dbdd12408 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td @@ -737,30 +737,6 @@ class ROTRPattern : AMDGPUPat < (BIT_ALIGN $src0, $src0, $src1) >; -multiclass IntMed3Pat { - - // This matches 16 permutations of - // min(max(a, b), max(min(a, b), c)) - def : AMDGPUPat < - (min (max_oneuse vt:$src0, vt:$src1), - (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst vt:$src0, vt:$src1, vt:$src2) ->; - - // This matches 16 permutations of - // max(min(x, y), min(max(x, y), z)) - def : AMDGPUPat < - (max (min_oneuse vt:$src0, vt:$src1), - (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst $src0, $src1, $src2) ->; -} - // Special conversion patterns def cvt_rpi_i32_f32 : PatFrag < diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index 4c8197975ce68..d84720f820ee3 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -1972,6 +1972,29 @@ defm : BFMPatterns ; defm : BFEPattern ; defm : SHA256MaPattern ; +multiclass IntMed3Pat { + + // This matches 16 permutations of + // min(max(a, b), max(min(a, b), c)) + def : AMDGPUPat < + (min (max_oneuse i32:$src0, i32:$src1), + (max_oneuse (min_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; + + // This matches 16 permutations of + // max(min(x, y), min(max(x, y), z)) + def : AMDGPUPat < + (max (min_oneuse i32:$src0, i32:$src1), + (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)), + (med3Inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2) +>; +} + defm : IntMed3Pat; defm : IntMed3Pat; @@ -2002,22 +2025,21 @@ multiclass Int16Med3Pat { + SDPatternOperator min_oneuse> { // This matches 16 permutations of // max(min(x, y), min(max(x, y), z)) def : GCNPat < - (max (min_oneuse vt:$src0, vt:$src1), - (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) + (max (min_oneuse i16:$src0, i16:$src1), + (min_oneuse (max_oneuse i16:$src0, i16:$src1), i16:$src2)), + (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) >; // This matches 16 permutations of // min(max(a, b), max(min(a, b), c)) def : GCNPat < - (min (max_oneuse vt:$src0, vt:$src1), - (max_oneuse (min_oneuse vt:$src0, vt:$src1), vt:$src2)), - (med3Inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2, DSTCLAMP.NONE) + (min (max_oneuse i16:$src0, i16:$src1), + (max_oneuse (min_oneuse i16:$src0, i16:$src1), 
i16:$src2)), + (med3Inst SRCMODS.NONE, VSrc_b16:$src0, SRCMODS.NONE, VSrc_b16:$src1, SRCMODS.NONE, VSrc_b16:$src2, DSTCLAMP.NONE) >; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir new file mode 100644 index 0000000000000..b09abd4be1ad7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.mir @@ -0,0 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s + +--- +name: smed3_s32_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MED3_I32_:%[0-9]+]]:vgpr_32 = V_MED3_I32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MED3_I32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- + +name: smed3_s32_sss +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + + ; GFX6-LABEL: name: smed3_s32_sss + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6: [[S_MAX_I32_:%[0-9]+]]:sreg_32 = S_MAX_I32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MIN_I32_:%[0-9]+]]:sreg_32 = S_MIN_I32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MAX_I32_1:%[0-9]+]]:sreg_32 = S_MAX_I32 [[S_MIN_I32_]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_MIN_I32_1:%[0-9]+]]:sreg_32 = S_MIN_I32 [[S_MAX_I32_]], [[S_MAX_I32_1]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_MIN_I32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_SMAX %0, %1 + %4:sgpr(s32) = G_SMIN %0, %1 + %5:sgpr(s32) = G_SMAX %4, %2 + %6:sgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- +name: smed3_s32_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MAX_I32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %3 +... 
+ +--- +name: smed3_s32_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MIN_I32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %4 +... + +--- +name: smed3_s32_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: smed3_s32_vvv_multiuse2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_I32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I32_e64 [[V_MIN_I32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_I32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I32_e64 [[V_MAX_I32_e64_]], [[V_MAX_I32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_I32_e64_1]], implicit [[V_MAX_I32_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_SMAX %0, %1 + %4:vgpr(s32) = G_SMIN %0, %1 + %5:vgpr(s32) = G_SMAX %4, %2 + %6:vgpr(s32) = G_SMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %5 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir new file mode 100644 index 0000000000000..9e029ee5e066c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-smed3.s16.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: smed3_s16_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]] + ; GFX9-LABEL: name: smed3_s16_vvv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MED3_I16_:%[0-9]+]]:vgpr_32 = V_MED3_I16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MED3_I16_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9 +... 
+ +--- +name: smed3_s16_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse0 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %6 +... 
+ +--- +name: smed3_s16_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MIN_I16_e64_]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MIN_I16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %7 +... 
+ +--- +name: smed3_s16_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: smed3_s16_vvv_multiuse2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_1]] + ; GFX9-LABEL: name: smed3_s16_vvv_multiuse2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_I16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_I16_e64 [[V_MIN_I16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_I16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_I16_e64 [[V_MAX_I16_e64_]], [[V_MAX_I16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_I16_e64_1]], implicit [[V_MAX_I16_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_SMAX %3, %4 + %7:vgpr(s16) = G_SMIN %3, %4 + %8:vgpr(s16) = G_SMAX %7, %5 + %9:vgpr(s16) = G_SMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %8 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir new file mode 100644 index 0000000000000..a8341251faf64 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.mir @@ -0,0 +1,140 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=tahiti -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX6 %s + +--- +name: umed3_s32_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MED3_U32_:%[0-9]+]]:vgpr_32 = V_MED3_U32 [[COPY]], [[COPY1]], [[COPY2]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MED3_U32_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6 +... 
+ +--- + +name: umed3_s32_sss +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $sgpr2 + + ; GFX6-LABEL: name: umed3_s32_sss + ; GFX6: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:sreg_32 = COPY $sgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:sreg_32 = COPY $sgpr2 + ; GFX6: [[S_MAX_U32_:%[0-9]+]]:sreg_32 = S_MAX_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MIN_U32_:%[0-9]+]]:sreg_32 = S_MIN_U32 [[COPY]], [[COPY1]], implicit-def $scc + ; GFX6: [[S_MAX_U32_1:%[0-9]+]]:sreg_32 = S_MAX_U32 [[S_MIN_U32_]], [[COPY2]], implicit-def $scc + ; GFX6: [[S_MIN_U32_1:%[0-9]+]]:sreg_32 = S_MIN_U32 [[S_MAX_U32_]], [[S_MAX_U32_1]], implicit-def $scc + ; GFX6: S_ENDPGM 0, implicit [[S_MIN_U32_1]] + %0:sgpr(s32) = COPY $sgpr0 + %1:sgpr(s32) = COPY $sgpr1 + %2:sgpr(s32) = COPY $sgpr2 + %3:sgpr(s32) = G_UMAX %0, %1 + %4:sgpr(s32) = G_UMIN %0, %1 + %5:sgpr(s32) = G_UMAX %4, %2 + %6:sgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6 +... + +--- +name: umed3_s32_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse0 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MAX_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %3 +... + +--- +name: umed3_s32_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse1 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MIN_U32_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %4 +... 
+ +--- +name: umed3_s32_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX6-LABEL: name: umed3_s32_vvv_multiuse2 + ; GFX6: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX6: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX6: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX6: [[V_MAX_U32_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX6: [[V_MAX_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U32_e64 [[V_MIN_U32_e64_]], [[COPY2]], implicit $exec + ; GFX6: [[V_MIN_U32_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U32_e64 [[V_MAX_U32_e64_]], [[V_MAX_U32_e64_1]], implicit $exec + ; GFX6: S_ENDPGM 0, implicit [[V_MIN_U32_e64_1]], implicit [[V_MAX_U32_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s32) = G_UMAX %0, %1 + %4:vgpr(s32) = G_UMIN %0, %1 + %5:vgpr(s32) = G_UMAX %4, %2 + %6:vgpr(s32) = G_UMIN %3, %5 + S_ENDPGM 0, implicit %6, implicit %5 +... diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir new file mode 100644 index 0000000000000..c323883ff6139 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-pattern-umed3.s16.mir @@ -0,0 +1,168 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX8 %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX9 %s + +--- +name: umed3_s16_vvv +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]] + ; GFX9-LABEL: name: umed3_s16_vvv + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MED3_U16_:%[0-9]+]]:vgpr_32 = V_MED3_U16 0, [[COPY]], 0, [[COPY1]], 0, [[COPY2]], 0, 0, implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MED3_U16_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9 +... 
+ +--- +name: umed3_s16_vvv_multiuse0 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse0 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse0 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %6 +... 
+ +--- +name: umed3_s16_vvv_multiuse1 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse1 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MIN_U16_e64_]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse1 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MIN_U16_e64_]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %7 +... 
+ +--- +name: umed3_s16_vvv_multiuse2 +legalized: true +regBankSelected: true + +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + ; GFX8-LABEL: name: umed3_s16_vvv_multiuse2 + ; GFX8: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX8: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX8: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX8: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX8: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX8: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX8: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_1]] + ; GFX9-LABEL: name: umed3_s16_vvv_multiuse2 + ; GFX9: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX9: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX9: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX9: [[V_MAX_U16_e64_:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[COPY]], [[COPY1]], implicit $exec + ; GFX9: [[V_MAX_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MAX_U16_e64 [[V_MIN_U16_e64_]], [[COPY2]], implicit $exec + ; GFX9: [[V_MIN_U16_e64_1:%[0-9]+]]:vgpr_32 = V_MIN_U16_e64 [[V_MAX_U16_e64_]], [[V_MAX_U16_e64_1]], implicit $exec + ; GFX9: S_ENDPGM 0, implicit [[V_MIN_U16_e64_1]], implicit [[V_MAX_U16_e64_1]] + %0:vgpr(s32) = COPY $vgpr0 + %1:vgpr(s32) = COPY $vgpr1 + %2:vgpr(s32) = COPY $vgpr2 + %3:vgpr(s16) = G_TRUNC %0 + %4:vgpr(s16) = G_TRUNC %1 + %5:vgpr(s16) = G_TRUNC %2 + + %6:vgpr(s16) = G_UMAX %3, %4 + %7:vgpr(s16) = G_UMIN %3, %4 + %8:vgpr(s16) = G_UMAX %7, %5 + %9:vgpr(s16) = G_UMIN %6, %8 + + S_ENDPGM 0, implicit %9, implicit %8 +... From de08244d3da52215b8c63bc70d0a3c9e07b620a3 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 8 Jan 2020 10:53:22 -0500 Subject: [PATCH 74/89] TableGen: Remove unused code --- llvm/utils/TableGen/CodeGenDAGPatterns.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.h b/llvm/utils/TableGen/CodeGenDAGPatterns.h index c61b55052533a..2c081b670609d 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.h +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.h @@ -1195,12 +1195,6 @@ class CodeGenDAGPatterns { return F->second; } - typedef std::map::const_iterator - nx_iterator; - nx_iterator nx_begin() const { return SDNodeXForms.begin(); } - nx_iterator nx_end() const { return SDNodeXForms.end(); } - - const ComplexPattern &getComplexPattern(Record *R) const { auto F = ComplexPatterns.find(R); assert(F != ComplexPatterns.end() && "Unknown addressing mode!"); From 0fa8f701ccf6f29544394b358f38b748e3f7ab24 Mon Sep 17 00:00:00 2001 From: Sid Manning Date: Thu, 2 Jan 2020 12:54:59 -0600 Subject: [PATCH 75/89] [ELF][Hexagon] Add support for IE relocations Differential Revision: https://reviews.llvm.org/D71143 --- lld/ELF/Arch/Hexagon.cpp | 22 ++++++++++ lld/ELF/Relocations.cpp | 4 +- lld/test/ELF/hexagon-tls-ie.s | 78 +++++++++++++++++++++++++++++++++++ 3 files changed, 103 insertions(+), 1 deletion(-) create mode 100644 lld/test/ELF/hexagon-tls-ie.s diff --git a/lld/ELF/Arch/Hexagon.cpp b/lld/ELF/Arch/Hexagon.cpp index d29932dd42a2e..106bc9bab5bd2 100644 --- a/lld/ELF/Arch/Hexagon.cpp +++ b/lld/ELF/Arch/Hexagon.cpp @@ -54,6 +54,7 @@ Hexagon::Hexagon() { // Hexagon Linux uses 64K pages by default. 
defaultMaxPageSize = 0x10000; noneRel = R_HEX_NONE; + tlsGotRel = R_HEX_TPREL_32; } uint32_t Hexagon::calcEFlags() const { @@ -115,6 +116,11 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_B22_PCREL_X: case R_HEX_B32_PCREL_X: return R_PLT_PC; + case R_HEX_IE_32_6_X: + case R_HEX_IE_16_X: + case R_HEX_IE_HI16: + case R_HEX_IE_LO16: + return R_GOT; case R_HEX_GOTREL_11_X: case R_HEX_GOTREL_16_X: case R_HEX_GOTREL_32_6_X: @@ -125,6 +131,13 @@ RelExpr Hexagon::getRelExpr(RelType type, const Symbol &s, case R_HEX_GOT_16_X: case R_HEX_GOT_32_6_X: return R_GOTPLT; + case R_HEX_IE_GOT_11_X: + case R_HEX_IE_GOT_16_X: + case R_HEX_IE_GOT_32_6_X: + case R_HEX_IE_GOT_HI16: + case R_HEX_IE_GOT_LO16: + config->hasStaticTlsModel = true; + return R_GOTPLT; case R_HEX_TPREL_11_X: case R_HEX_TPREL_16: case R_HEX_TPREL_16_X: @@ -227,6 +240,7 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { or32le(loc, applyMask(0x00203fe0, val & 0x3f)); break; case R_HEX_11_X: + case R_HEX_IE_GOT_11_X: case R_HEX_GOT_11_X: case R_HEX_GOTREL_11_X: case R_HEX_TPREL_11_X: @@ -236,6 +250,8 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { or32le(loc, applyMask(0x000007e0, val)); break; case R_HEX_16_X: // These relocs only have 6 effective bits. + case R_HEX_IE_16_X: + case R_HEX_IE_GOT_16_X: case R_HEX_GOT_16_X: case R_HEX_GOTREL_16_X: case R_HEX_TPREL_16_X: @@ -251,6 +267,8 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { case R_HEX_32_6_X: case R_HEX_GOT_32_6_X: case R_HEX_GOTREL_32_6_X: + case R_HEX_IE_GOT_32_6_X: + case R_HEX_IE_32_6_X: case R_HEX_TPREL_32_6_X: or32le(loc, applyMask(0x0fff3fff, val >> 6)); break; @@ -285,11 +303,15 @@ void Hexagon::relocateOne(uint8_t *loc, RelType type, uint64_t val) const { break; case R_HEX_GOTREL_HI16: case R_HEX_HI16: + case R_HEX_IE_GOT_HI16: + case R_HEX_IE_HI16: case R_HEX_TPREL_HI16: or32le(loc, applyMask(0x00c03fff, val >> 16)); break; case R_HEX_GOTREL_LO16: case R_HEX_LO16: + case R_HEX_IE_GOT_LO16: + case R_HEX_IE_LO16: case R_HEX_TPREL_LO16: or32le(loc, applyMask(0x00c03fff, val)); break; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 8025779d18dc1..1df8a157cfe5e 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -177,7 +177,9 @@ handleTlsRelocation(RelType type, Symbol &sym, InputSectionBase &c, return 1; } - bool canRelax = config->emachine != EM_ARM && config->emachine != EM_RISCV; + bool canRelax = config->emachine != EM_ARM && + config->emachine != EM_HEXAGON && + config->emachine != EM_RISCV; // If we are producing an executable and the symbol is non-preemptable, it // must be defined and the code sequence can be relaxed to use Local-Exec. 
diff --git a/lld/test/ELF/hexagon-tls-ie.s b/lld/test/ELF/hexagon-tls-ie.s new file mode 100644 index 0000000000000..ea05279473116 --- /dev/null +++ b/lld/test/ELF/hexagon-tls-ie.s @@ -0,0 +1,78 @@ +# REQUIRES: hexagon +# RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o +# RUN: llvm-readobj -r %t.o | FileCheck -check-prefix=RELOC %s +# RUN: ld.lld %t.o -o %t +## shared needs -z notext because of the R_HEX_IE_16/32_X(R_GOT) static +## relocations +# RUN: ld.lld -z notext -shared %t.o -o %t.so +# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t | FileCheck %s +# RUN: llvm-readobj -x .got %t | FileCheck -check-prefix=GOT %s +# RUN: llvm-objdump -d --no-show-raw-insn --print-imm-hex %t.so | \ +# RUN: FileCheck -check-prefix=SHARED %s +# RUN: llvm-readobj -r %t.so | FileCheck -check-prefix=RELA %s + + .globl _start + .type _start, @function +_start: + +# RELOC: 0x0 R_HEX_IE_32_6_X a 0x0 +# RELOC-NEXT: 0x4 R_HEX_IE_16_X a 0x0 +# CHECK: { immext(#0x30180) +# CHECK-NEXT: r2 = memw(##0x301a4) } + r2 = memw(##a@IE) + +# RELOC-NEXT: 0x8 R_HEX_IE_LO16 a 0x0 +# CHECK: { r2.l = #0x1a4 } + r2.l = #a@IE +# RELOC-NEXT: 0xC R_HEX_IE_HI16 a 0x0 +# CHECK: { r2.h = #0x3 } + r2.h = #a@IE + + +# GOT: Hex dump of section '.got': +# GOT-NEXT: 0x000301a4 f0ffffff f4ffffff f8ffffff fcffffff + r2 = memw(##a@IE) + r2 = memw(##b@IE) + r2 = memw(##c@IE) + r2 = memw(##d@IE) + +# RELOC: 0x30 R_HEX_IE_GOT_32_6_X a 0x0 +# RELOC-NEXT: 0x34 R_HEX_IE_GOT_16_X a 0x0 +# SHARED: { immext(#0xfffeffc0) +# SHARED-NEXT: r2 = memw(##0xfffefff0) } + r2 = memw(##a@IEGOT) + +# RELOC-NEXT: 0x38 R_HEX_IE_GOT_LO16 a 0x0 +# SHARED: { r2.l = #0xfff0 } + r2.l = #a@IEGOT +# RELOC-NEXT: 0x3C R_HEX_IE_GOT_HI16 a 0x0 +# SHARED: { r2.h = #0xfffe } + r2.h = #a@IEGOT + +# RELOC: 0x44 R_HEX_IE_GOT_11_X a 0x0 +# SHARED: { immext(#0xfffeffc0) +# SHARED-NEXT: r0 = !cmp.eq(r1,##-0x10010) } + r0=!cmp.eq(r1,##a@iegot) + +# RELA: 0x203C4 R_HEX_TPREL_32 a 0x0 +# RELA-NEXT: 0x203C8 R_HEX_TPREL_32 b 0x0 +# RELA-NEXT: 0x203CC R_HEX_TPREL_32 c 0x0 +# RELA-NEXT: 0x203D0 R_HEX_TPREL_32 d 0x0 + r2 = memw(##b@IEGOT) + r2 = memw(##c@IEGOT) + r2 = memw(##d@IEGOT) + + +.section .tdata,"awT",@progbits +.globl a +a: +.word 1 +.globl b +b: +.word 2 +.globl c +c: +.word 3 +.globl d +d: +.word 4 From 6c04ef472a8739a60a7935f77edfbacd2f50eb73 Mon Sep 17 00:00:00 2001 From: "@raghesh (Raghesh Aloor)" Date: Thu, 9 Jan 2020 10:52:39 -0500 Subject: [PATCH 76/89] [InstCombine] Z / (1.0 / Y) => (Y * Z) This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. The m_OneUse check is avoided because even in the case of the multiple uses for 1.0/Y, the number of instructions remain the same and a division is replaced by a multiplication. Differential Revision: https://reviews.llvm.org/D72319 --- llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp | 8 ++++++++ llvm/test/Transforms/InstCombine/fdiv.ll | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index f7b39d98d4923..2774e46151faf 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -1239,6 +1239,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) { Value *YZ = Builder.CreateFMulFMF(Y, Op0, &I); return BinaryOperator::CreateFDivFMF(YZ, X, &I); } + // Z / (1.0 / Y) => (Y * Z) + // + // This is a special case of Z / (X / Y) => (Y * Z) / X, with X = 1.0. 
The + // m_OneUse check is avoided because even in the case of the multiple uses + // for 1.0/Y, the number of instructions remain the same and a division is + // replaced by a multiplication. + if (match(Op1, m_FDiv(m_SpecificFP(1.0), m_Value(Y)))) + return BinaryOperator::CreateFMulFMF(Y, Op0, &I); } if (I.hasAllowReassoc() && Op0->hasOneUse() && Op1->hasOneUse()) { diff --git a/llvm/test/Transforms/InstCombine/fdiv.ll b/llvm/test/Transforms/InstCombine/fdiv.ll index 178fe1bcef439..ec1119cb24573 100644 --- a/llvm/test/Transforms/InstCombine/fdiv.ll +++ b/llvm/test/Transforms/InstCombine/fdiv.ll @@ -187,12 +187,12 @@ define float @div_with_div_denominator_extra_use(float %x, float %y, float %z) { ret float %div2 } -; Z / (1.0 / Y) +; Z / (1.0 / Y) ==> Y * Z define float @div_with_div_denominator_with_one_as_numerator_extra_use(float %x, float %y, float %z) { ; CHECK-LABEL: @div_with_div_denominator_with_one_as_numerator_extra_use( ; CHECK-NEXT: [[DIV1:%.*]] = fdiv float 1.000000e+00, [[Y:%.*]] -; CHECK-NEXT: [[DIV2:%.*]] = fdiv reassoc arcp float [[Z:%.*]], [[DIV1]] +; CHECK-NEXT: [[DIV2:%.*]] = fmul reassoc arcp float [[Y]], [[Z:%.*]] ; CHECK-NEXT: call void @use_f32(float [[DIV1]]) ; CHECK-NEXT: ret float [[DIV2]] ; From fb76c7934dce2ede7e04097edb455eba58f6ebfd Mon Sep 17 00:00:00 2001 From: Sergej Jaskiewicz Date: Thu, 26 Dec 2019 12:28:39 +0300 Subject: [PATCH 77/89] [libcxxabi] Allow tests to link with static libc++abi/libc++ even if the shared version is present Summary: Right now the only way to force libc++abi tests to link with the static version of libc++abi is to set `LIBCXXABI_ENABLE_SHARED` to `OFF`. However, this doesn't work when libc++abi is built as standalone project because of [this](https://github.com/llvm/llvm-project/blob/54c522420347e58aa7bae1892cf5c5672b57c875/libcxxabi/CMakeLists.txt#L503-L519). This change allows specifying the version of the library for tests to link with. This is useful for remote testing, for example, with `SSHExecutor`, where we _have_ to link with libc++abi statically. Two new CMake options are introduced here: `LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI` and `LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX`. They can be set to `OFF` to tell the test utility to link tests with the static libraries. It shouldn't break anything, because the default values of these options are set such that the test utility will behave the same way. Reviewers: EricWF, mclow.lists, phosek, mehdi_amini, ldionne, jroelofs, bcraig Subscribers: mgorny, christof, ldionne, libcxx-commits Tags: #libc Differential Revision: https://reviews.llvm.org/D71894 --- libcxxabi/CMakeLists.txt | 30 +++++++++++++++++++++++++++++- libcxxabi/test/CMakeLists.txt | 2 ++ libcxxabi/test/lit.site.cfg.in | 4 ++-- 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt index 0ddcd5f971f7f..f3698e9c46e1b 100644 --- a/libcxxabi/CMakeLists.txt +++ b/libcxxabi/CMakeLists.txt @@ -92,6 +92,14 @@ usual symlinks pointing to that.") option(LIBCXXABI_ENABLE_SHARED "Build libc++abi as a shared library." ON) option(LIBCXXABI_ENABLE_STATIC "Build libc++abi as a static library." 
ON) +option(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI + "Whether the libc++abi tests should link with the shared libc++abi library" + ${LIBCXXABI_ENABLE_SHARED}) + +option(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + "Whether the libc++abi tests should link with the shared libc++ library" + ${LIBCXX_ENABLE_SHARED}) + cmake_dependent_option(LIBCXXABI_INSTALL_STATIC_LIBRARY "Install the static libc++abi library." ON "LIBCXXABI_ENABLE_STATIC;LIBCXXABI_INSTALL_LIBRARY" OFF) @@ -115,6 +123,26 @@ if (NOT LIBCXXABI_ENABLE_SHARED AND NOT LIBCXXABI_ENABLE_STATIC) message(FATAL_ERROR "libc++abi must be built as either a shared or static library.") endif() +if(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI AND NOT LIBCXXABI_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI being ON requires LIBCXXABI_ENABLE_SHARED to be ON") +endif() + +if(NOT LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI AND NOT LIBCXXABI_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI being OFF requires LIBCXXABI_ENABLE_STATIC to be ON") +endif() + +if(DEFINED LIBCXX_ENABLE_SHARED + AND LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + AND NOT LIBCXX_ENABLE_SHARED) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX being ON requires LIBCXX_ENABLE_SHARED to be ON") +endif() + +if(DEFINED LIBCXX_ENABLE_STATIC + AND NOT LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX + AND NOT LIBCXX_ENABLE_STATIC) + message(FATAL_ERROR "LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX being OFF requires LIBCXX_ENABLE_STATIC to be ON") +endif() + if (LLVM_EXTERNAL_LIBCXX_SOURCE_DIR) set(LIBCXXABI_LIBCXX_SRC_DIRS ${LLVM_EXTERNAL_LIBCXX_SOURCE_DIR}) else() @@ -209,7 +237,7 @@ set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBCXXABI_LIBRARY_DIR}) # directory. if (NOT LIBCXXABI_LIBCXX_LIBRARY_PATH) set(LIBCXXABI_LIBCXX_LIBRARY_PATH "${LIBCXXABI_LIBRARY_DIR}" CACHE PATH - "The path to libc++ library.") + "The path to libc++ library." FORCE) endif() # Check that we can build with 32 bits if requested. 
diff --git a/libcxxabi/test/CMakeLists.txt b/libcxxabi/test/CMakeLists.txt index 60e052d20710c..23c0dac5b8568 100644 --- a/libcxxabi/test/CMakeLists.txt +++ b/libcxxabi/test/CMakeLists.txt @@ -20,6 +20,8 @@ pythonize_bool(LIBCXXABI_USE_LLVM_UNWINDER) pythonize_bool(LIBCXXABI_USE_COMPILER_RT) pythonize_bool(LIBCXXABI_BUILD_EXTERNAL_THREAD_LIBRARY) pythonize_bool(LIBCXX_ENABLE_PARALLEL_ALGORITHMS) +pythonize_bool(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX) +pythonize_bool(LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI) set(LIBCXXABI_TARGET_INFO "libcxx.test.target_info.LocalTI" CACHE STRING "TargetInfo to use when setting up test environment.") set(LIBCXXABI_EXECUTOR "None" CACHE STRING diff --git a/libcxxabi/test/lit.site.cfg.in b/libcxxabi/test/lit.site.cfg.in index 8ac2fa45cab0b..8125e89a4533c 100644 --- a/libcxxabi/test/lit.site.cfg.in +++ b/libcxxabi/test/lit.site.cfg.in @@ -16,8 +16,8 @@ config.sanitizer_library = "@LIBCXXABI_SANITIZER_LIBRARY@" config.enable_32bit = @LIBCXXABI_BUILD_32_BITS@ config.target_info = "@LIBCXXABI_TARGET_INFO@" config.executor = "@LIBCXXABI_EXECUTOR@" -config.libcxxabi_shared = @LIBCXXABI_ENABLE_SHARED@ -config.enable_shared = @LIBCXX_ENABLE_SHARED@ +config.libcxxabi_shared = @LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXXABI@ +config.enable_shared = @LIBCXXABI_LINK_TESTS_WITH_SHARED_LIBCXX@ config.enable_exceptions = @LIBCXXABI_ENABLE_EXCEPTIONS@ config.host_triple = "@LLVM_HOST_TRIPLE@" config.target_triple = "@TARGET_TRIPLE@" From ea9888b8f6f20887647b77ebf5864d647fd2ea44 Mon Sep 17 00:00:00 2001 From: David Goldman Date: Wed, 8 Jan 2020 17:01:59 -0500 Subject: [PATCH 78/89] [clangd] Respect `--sysroot` argument if it is set Summary: - Since `--sysroot` is a superset of the `-isysroot` argument, we shouldn't add the `-isysroot` if we detect a `--sysroot` flag. Reviewers: sammccall Subscribers: ilya-biryukov, MaskRay, jkorous, arphaman, kadircet, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D72415 --- clang-tools-extra/clangd/CompileCommands.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/clangd/CompileCommands.cpp b/clang-tools-extra/clangd/CompileCommands.cpp index b1eca02813b38..f11b1236803c7 100644 --- a/clang-tools-extra/clangd/CompileCommands.cpp +++ b/clang-tools-extra/clangd/CompileCommands.cpp @@ -155,7 +155,9 @@ void CommandMangler::adjust(std::vector &Cmd) const { if (ResourceDir && !Has("-resource-dir")) Cmd.push_back(("-resource-dir=" + *ResourceDir)); - if (Sysroot && !Has("-isysroot")) { + // Don't set `-isysroot` if it is already set or if `--sysroot` is set. + // `--sysroot` is a superset of the `-isysroot` argument. + if (Sysroot && !Has("-isysroot") && !Has("--sysroot")) { Cmd.push_back("-isysroot"); Cmd.push_back(*Sysroot); } From 93a1e9c90c96a9130352bf358d7777f0379ebb48 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 9 Jan 2020 08:13:25 -0800 Subject: [PATCH 79/89] [lldb/SWIG] Add missing '\' in macros again Making the string conversion operator a macro unintentionally dropped the backslash before '\n' and '\r' and was therefore incorrectly stripping 'n' and 'r' from the object description. 
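As a small illustration of the failure mode (not part of the change; the test string is made up), comparing the last character against 'n'/'r' instead of '\n'/'\r' trims an ordinary trailing letter:

    #include <cstdio>
    #include <cstring>

    // Mimics the trimming logic with and without the escaped newline /
    // carriage return; the unescaped variant wrongly drops a trailing letter.
    static size_t trimmed_len(const char *desc, bool escaped) {
      size_t len = std::strlen(desc);
      const char nl = escaped ? '\n' : 'n';
      const char cr = escaped ? '\r' : 'r';
      if (len > 0 && (desc[len - 1] == nl || desc[len - 1] == cr))
        --len;
      return len;
    }

    int main() {
      std::printf("%zu vs %zu\n", trimmed_len("description", true),
                  trimmed_len("description", false));
      // Prints "11 vs 10": without the backslashes the final 'n' is stripped.
      return 0;
    }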
--- lldb/scripts/macros.swig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lldb/scripts/macros.swig b/lldb/scripts/macros.swig index e0756c2f17932..0387f27f3cb9b 100644 --- a/lldb/scripts/macros.swig +++ b/lldb/scripts/macros.swig @@ -6,7 +6,7 @@ $self->GetDescription (stream, Level); const char *desc = stream.GetData(); size_t desc_len = stream.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == 'n' || desc[desc_len-1] == 'r')) { + if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) { --desc_len; } return std::string(desc, desc_len); @@ -23,7 +23,7 @@ $self->GetDescription (stream); const char *desc = stream.GetData(); size_t desc_len = stream.GetSize(); - if (desc_len > 0 && (desc[desc_len-1] == 'n' || desc[desc_len-1] == 'r')) { + if (desc_len > 0 && (desc[desc_len-1] == '\n' || desc[desc_len-1] == '\r')) { --desc_len; } return std::string(desc, desc_len); From 45c971f7eef18ef2b77a5f64133dbd7bd5939d5f Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 9 Jan 2020 08:15:01 -0800 Subject: [PATCH 80/89] [lldb/Lua] Make lldb.debugger et al available to Lua The Python script interpreter makes the current debugger, target, process, thread and frame available to interactive scripting sessions through convenience variables. This patch does the same for Lua. Differential revision: https://reviews.llvm.org/D71801 --- .../Plugins/ScriptInterpreter/Lua/Lua.cpp | 2 +- .../Plugins/ScriptInterpreter/Lua/Lua.h | 2 +- .../Lua/ScriptInterpreterLua.cpp | 35 ++++++++++++++++++- .../Lua/ScriptInterpreterLua.h | 4 +++ .../Lua/Inputs/independent_state.in | 6 ++++ .../Lua/Inputs/nested_sessions.in | 6 ++++ .../Lua/Inputs/nested_sessions_2.in | 2 ++ .../Lua/convenience_variables.test | 17 +++++++++ .../Lua/independent_state.test | 6 ++++ .../Lua/nested_sessions.test | 12 +++++++ 10 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test create mode 100644 lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp index dc64139fa4e5c..1dd0a9eade0c8 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.cpp @@ -10,9 +10,9 @@ #include "llvm/Support/FormatVariadic.h" using namespace lldb_private; +using namespace lldb; llvm::Error Lua::Run(llvm::StringRef buffer) { - std::lock_guard lock(m_mutex); int error = luaL_loadbuffer(m_lua_state, buffer.data(), buffer.size(), "buffer") || lua_pcall(m_lua_state, 0, 0, 0); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h index ed1d159590ac5..adc6c61184367 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/Lua.h @@ -9,6 +9,7 @@ #ifndef liblldb_Lua_h_ #define liblldb_Lua_h_ +#include "lldb/lldb-types.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Error.h" @@ -38,7 +39,6 @@ class Lua { llvm::Error Run(llvm::StringRef buffer); private: - std::mutex m_mutex; lua_State *m_lua_state; }; diff --git 
a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp index d5423b78b8c43..e46851c450920 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.cpp @@ -27,7 +27,13 @@ class IOHandlerLuaInterpreter : public IOHandlerDelegate, : IOHandlerEditline(debugger, IOHandler::Type::LuaInterpreter, "lua", ">>> ", "..> ", true, debugger.GetUseColor(), 0, *this, nullptr), - m_script_interpreter(script_interpreter) {} + m_script_interpreter(script_interpreter) { + llvm::cantFail(m_script_interpreter.EnterSession(debugger.GetID())); + } + + ~IOHandlerLuaInterpreter() { + llvm::cantFail(m_script_interpreter.LeaveSession()); + } void IOHandlerInputComplete(IOHandler &io_handler, std::string &data) override { @@ -89,6 +95,33 @@ void ScriptInterpreterLua::Initialize() { void ScriptInterpreterLua::Terminate() {} +llvm::Error ScriptInterpreterLua::EnterSession(user_id_t debugger_id) { + if (m_session_is_active) + return llvm::Error::success(); + + const char *fmt_str = + "lldb.debugger = lldb.SBDebugger.FindDebuggerWithID({0}); " + "lldb.target = lldb.debugger:GetSelectedTarget(); " + "lldb.process = lldb.target:GetProcess(); " + "lldb.thread = lldb.process:GetSelectedThread(); " + "lldb.frame = lldb.thread:GetSelectedFrame()"; + return m_lua->Run(llvm::formatv(fmt_str, debugger_id).str()); +} + +llvm::Error ScriptInterpreterLua::LeaveSession() { + if (!m_session_is_active) + return llvm::Error::success(); + + m_session_is_active = false; + + llvm::StringRef str = "lldb.debugger = nil; " + "lldb.target = nil; " + "lldb.process = nil; " + "lldb.thread = nil; " + "lldb.frame = nil"; + return m_lua->Run(str); +} + lldb::ScriptInterpreterSP ScriptInterpreterLua::CreateInstance(Debugger &debugger) { return std::make_shared(debugger); diff --git a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h index b34c7d0e82176..550e1035567ca 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h +++ b/lldb/source/Plugins/ScriptInterpreter/Lua/ScriptInterpreterLua.h @@ -43,8 +43,12 @@ class ScriptInterpreterLua : public ScriptInterpreter { Lua &GetLua(); + llvm::Error EnterSession(lldb::user_id_t debugger_id); + llvm::Error LeaveSession(); + private: std::unique_ptr m_lua; + bool m_session_is_active = false; }; } // namespace lldb_private diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in new file mode 100644 index 0000000000000..6e15a8ff663ec --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/independent_state.in @@ -0,0 +1,6 @@ +script foobar = 40 + 7 +script print(foobar) +script d = lldb.SBDebugger.Create() +script d:HandleCommand("script foobar = 40 + 2") +script print(foobar) +script d:HandleCommand("script print(foobar)") diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in new file mode 100644 index 0000000000000..75c57e364cac7 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions.in @@ -0,0 +1,6 @@ +script +print(lldb.target, lldb.debugger:GetSelectedTarget()) +lldb.debugger:SetSelectedTarget(lldb.debugger:GetTargetAtIndex(0)) +print(lldb.target, lldb.debugger:GetSelectedTarget()) +lldb.debugger:HandleCommand("script 
print(lldb.target, lldb.debugger:GetSelectedTarget())") +print(lldb.target, lldb.debugger:GetSelectedTarget()) diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in new file mode 100644 index 0000000000000..a8cc2a57a55b3 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/Inputs/nested_sessions_2.in @@ -0,0 +1,2 @@ +script +print(lldb.target, lldb.debugger:GetSelectedTarget()) diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test new file mode 100644 index 0000000000000..022f2e38db49a --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/convenience_variables.test @@ -0,0 +1,17 @@ +# REQUIRES: lua +# +# This tests that the convenience variables are not nil. Given that there is no +# target we only expect the debugger to be valid. +# +# RUN: cat %s | %lldb --script-language lua 2>&1 | FileCheck %s +script +print(string.format("lldb.debugger is valid: %s", lldb.debugger:IsValid())) +print(string.format("lldb.target is valid: %s", lldb.target:IsValid())) +print(string.format("lldb.process is valid: %s", lldb.process:IsValid())) +print(string.format("lldb.thread is valid: %s", lldb.thread:IsValid())) +print(string.format("lldb.frame is valid: %s", lldb.frame:IsValid())) +# CHECK: debugger is valid: true +# CHECK: target is valid: false +# CHECK: process is valid: false +# CHECK: thread is valid: false +# CHECK: frame is valid: false diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test b/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test new file mode 100644 index 0000000000000..2ade1b91c1cb6 --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/independent_state.test @@ -0,0 +1,6 @@ +# REQUIRES: lua +# +# RUN: %lldb --script-language lua -s %S/Inputs/independent_state.in 2>&1 | FileCheck %s +# CHECK: 47 +# CHECK: 47 +# CHECK: 42 diff --git a/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test b/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test new file mode 100644 index 0000000000000..a81418b6af61d --- /dev/null +++ b/lldb/test/Shell/ScriptInterpreter/Lua/nested_sessions.test @@ -0,0 +1,12 @@ +# REQUIRES: lua +# RUN: mkdir -p %t +# RUN: echo "int main() { return 0; }" | %clang_host -x c - -o %t/foo +# RUN: echo "int main() { return 0; }" | %clang_host -x c - -o %t/bar +# RUN: %lldb --script-language lua -o "file %t/bar" -o "file %t/foo" -s %S/Inputs/nested_sessions.in -s %S/Inputs/nested_sessions_2.in 2>&1 | FileCheck %s +# CHECK: script +# CHECK-NEXT: foo foo +# CHECK-NEXT: foo bar +# CHECK-NEXT: foo bar +# CHECK-NEXT: foo bar +# CHECK: script +# CHECK-NEXT: bar bar From d27a15fed79c8e8484825ce6c3ce1682daeb1547 Mon Sep 17 00:00:00 2001 From: Whitney Tsang Date: Thu, 9 Jan 2020 16:15:53 +0000 Subject: [PATCH 81/89] [NFCI][LoopUnrollAndJam] Changing LoopUnrollAndJamPass to a function pass. Summary: This patch changes LoopUnrollAndJamPass to a function pass, and keeps the loop traversal order the same as defined by FunctionToLoopPassAdaptor in LoopPassManager.h. The next patch will change the loop traversal to outer-to-inner order, so more loops can be transformed.
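For example, pipeline setup now adds the pass directly to a function pass manager instead of wrapping it in a loop-pass adaptor (a minimal sketch mirroring the PassBuilder change below; headers and the surrounding pipeline are omitted and illustrative): llvm::FunctionPassManager FPM; /* Before: FPM.addPass(createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(OptLevel))); */ FPM.addPass(llvm::LoopUnrollAndJamPass(/*OptLevel=*/2)); // now a function pass. In the new pass manager registry it correspondingly moves from LOOP_PASS to FUNCTION_PASS, so the updated tests invoke it with -passes='unroll-and-jam'.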
Discussion in llvm-dev mailing list: https://groups.google.com/forum/#!topic/llvm-dev/LF4rUjkVI2g Reviewer: dmgreen, jdoerfert, Meinersbur, kbarton, bmahjour, etiotto Reviewed By: dmgreen Subscribers: hiraditya, zzheng, llvm-commits Tag: LLVM Differential Revision: https://reviews.llvm.org/D72230 --- .../Transforms/Scalar/LoopUnrollAndJamPass.h | 7 +- llvm/lib/Passes/PassBuilder.cpp | 3 +- llvm/lib/Passes/PassRegistry.def | 2 +- .../Scalar/LoopUnrollAndJamPass.cpp | 104 +++++++++++------- .../LoopUnrollAndJam/dependencies.ll | 1 + .../Transforms/LoopUnrollAndJam/disable.ll | 1 + .../LoopUnrollAndJam/pragma-explicit.ll | 1 + .../LoopUnrollAndJam/unroll-and-jam.ll | 1 + 8 files changed, 70 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h index 7920269b0fb27..233963528595e 100644 --- a/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h +++ b/llvm/include/llvm/Transforms/Scalar/LoopUnrollAndJamPass.h @@ -15,9 +15,7 @@ namespace llvm { -class Loop; -struct LoopStandardAnalysisResults; -class LPMUpdater; +class Function; /// A simple loop rotation transformation. class LoopUnrollAndJamPass : public PassInfoMixin { @@ -25,8 +23,7 @@ class LoopUnrollAndJamPass : public PassInfoMixin { public: explicit LoopUnrollAndJamPass(int OptLevel = 2) : OptLevel(OptLevel) {} - PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, LPMUpdater &U); + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 64d748f94f144..646eb7d26cbdd 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -970,8 +970,7 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline( // across the loop nests. 
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll if (EnableUnrollAndJam && PTO.LoopUnrolling) { - OptimizePM.addPass( - createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level))); + OptimizePM.addPass(LoopUnrollAndJamPass(Level)); } OptimizePM.addPass(LoopUnrollPass( LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling, diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def index 66b38872b386e..3efb57cd35890 100644 --- a/llvm/lib/Passes/PassRegistry.def +++ b/llvm/lib/Passes/PassRegistry.def @@ -235,6 +235,7 @@ FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass()) FUNCTION_PASS("sroa", SROA()) FUNCTION_PASS("tailcallelim", TailCallElimPass()) FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass()) +FUNCTION_PASS("unroll-and-jam", LoopUnrollAndJamPass()) FUNCTION_PASS("verify", VerifierPass()) FUNCTION_PASS("verify", DominatorTreeVerifierPass()) FUNCTION_PASS("verify", LoopVerifierPass()) @@ -307,7 +308,6 @@ LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass()) LOOP_PASS("strength-reduce", LoopStrengthReducePass()) LOOP_PASS("indvars", IndVarSimplifyPass()) LOOP_PASS("irce", IRCEPass()) -LOOP_PASS("unroll-and-jam", LoopUnrollAndJamPass()) LOOP_PASS("unroll-full", LoopFullUnrollPass()) LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs())) LOOP_PASS("print", DDGAnalysisPrinterPass(dbgs())) diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp index 558f63113db63..92ad8dafa5abc 100644 --- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp +++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp @@ -427,51 +427,76 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, return UnrollResult; } +static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI, + ScalarEvolution &SE, + const TargetTransformInfo &TTI, + AssumptionCache &AC, DependenceInfo &DI, + OptimizationRemarkEmitter &ORE, + int OptLevel) { + bool DidSomething = false; + + // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA. + // Since simplification may add new inner loops, it has to run before the + // legality and profitability checks. This means running the loop unroll and jam pass + // will simplify all loops, regardless of whether anything end up being + // unroll and jammed. 
+ for (auto &L : LI) { + DidSomething |= + simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */); + DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE); + } + + SmallPriorityWorklist Worklist; + internal::appendLoopsToWorklist(reverse(LI), Worklist); + while (!Worklist.empty()) { + Loop *L = Worklist.pop_back_val(); + formLCSSA(*L, DT, &LI, &SE); + LoopUnrollResult Result = + tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel); + if (Result != LoopUnrollResult::Unmodified) + DidSomething = true; + } + + return DidSomething; +} + namespace { -class LoopUnrollAndJam : public LoopPass { +class LoopUnrollAndJam : public FunctionPass { public: static char ID; // Pass ID, replacement for typeid unsigned OptLevel; - LoopUnrollAndJam(int OptLevel = 2) : LoopPass(ID), OptLevel(OptLevel) { + LoopUnrollAndJam(int OptLevel = 2) : FunctionPass(ID), OptLevel(OptLevel) { initializeLoopUnrollAndJamPass(*PassRegistry::getPassRegistry()); } - bool runOnLoop(Loop *L, LPPassManager &LPM) override { - if (skipLoop(L)) + bool runOnFunction(Function &F) override { + if (skipFunction(F)) return false; - Function &F = *L->getHeader()->getParent(); - auto &DT = getAnalysis().getDomTree(); - LoopInfo *LI = &getAnalysis().getLoopInfo(); + LoopInfo &LI = getAnalysis().getLoopInfo(); ScalarEvolution &SE = getAnalysis().getSE(); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); auto &AC = getAnalysis().getAssumptionCache(F); auto &DI = getAnalysis().getDI(); - // For the old PM, we can't use OptimizationRemarkEmitter as an analysis - // pass. Function analyses need to be preserved across loop transformations - // but ORE cannot be preserved (see comment before the pass definition). - OptimizationRemarkEmitter ORE(&F); - - LoopUnrollResult Result = - tryToUnrollAndJamLoop(L, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); + auto &ORE = getAnalysis().getORE(); - if (Result == LoopUnrollResult::FullyUnrolled) - LPM.markLoopAsDeleted(*L); - - return Result != LoopUnrollResult::Unmodified; + return tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel); } /// This transformation requires natural loop information & requires that /// loop preheaders be inserted into the CFG... 
void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addRequired(); - getLoopAnalysisUsage(AU); + AU.addRequired(); } }; @@ -481,10 +506,13 @@ char LoopUnrollAndJam::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) -INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) -INITIALIZE_PASS_DEPENDENCY(LoopPass) +INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(DependenceAnalysisWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(LoopUnrollAndJam, "loop-unroll-and-jam", "Unroll and Jam loops", false, false) @@ -492,26 +520,18 @@ Pass *llvm::createLoopUnrollAndJamPass(int OptLevel) { return new LoopUnrollAndJam(OptLevel); } -PreservedAnalyses LoopUnrollAndJamPass::run(Loop &L, LoopAnalysisManager &AM, - LoopStandardAnalysisResults &AR, - LPMUpdater &) { - const auto &FAM = - AM.getResult(L, AR).getManager(); - Function *F = L.getHeader()->getParent(); - - auto *ORE = FAM.getCachedResult(*F); - // FIXME: This should probably be optional rather than required. - if (!ORE) - report_fatal_error( - "LoopUnrollAndJamPass: OptimizationRemarkEmitterAnalysis not cached at " - "a higher level"); - - DependenceInfo DI(F, &AR.AA, &AR.SE, &AR.LI); - - LoopUnrollResult Result = tryToUnrollAndJamLoop( - &L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, DI, *ORE, OptLevel); - - if (Result == LoopUnrollResult::Unmodified) +PreservedAnalyses LoopUnrollAndJamPass::run(Function &F, + FunctionAnalysisManager &AM) { + ScalarEvolution &SE = AM.getResult(F); + LoopInfo &LI = AM.getResult(F); + TargetTransformInfo &TTI = AM.getResult(F); + AssumptionCache &AC = AM.getResult(F); + DominatorTree &DT = AM.getResult(F); + DependenceInfo &DI = AM.getResult(F); + OptimizationRemarkEmitter &ORE = + AM.getResult(F); + + if (!tryToUnrollAndJamLoop(F, DT, LI, SE, TTI, AC, DI, ORE, OptLevel)) return PreservedAnalyses::all(); return getLoopPassPreservedAnalyses(); diff --git a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll index 890683043b4bf..d584238745cfe 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/dependencies.ll @@ -1,4 +1,5 @@ ; RUN: opt -basicaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 < %s -S | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll index 4a00937b9c58c..6e879896c55fc 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/disable.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/disable.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s +; RUN: opt -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 -pass-remarks=loop-unroll-and-jam < %s -S 2>&1 | FileCheck %s target datalayout = 
"e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll index 5254c779d0f87..f2c12702f9e89 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/pragma-explicit.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-unroll-and-jam -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-runtime -unroll-partial-threshold=60 < %s -S | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll index bdb47c27f1cfc..7580b50f8703b 100644 --- a/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll +++ b/llvm/test/Transforms/LoopUnrollAndJam/unroll-and-jam.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt -basicaa -tbaa -loop-unroll-and-jam -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s +; RUN: opt -aa-pipeline=type-based-aa,basic-aa -passes='unroll-and-jam' -allow-unroll-and-jam -unroll-and-jam-count=4 -unroll-remainder < %s -S | FileCheck %s target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" From cee4a1c957426e9477c8579ff960c8c2bd4af9e2 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 11 Dec 2019 21:30:10 +0100 Subject: [PATCH 82/89] Improve support of GNU mempcpy - Lower to the memcpy intrinsic - Raise warnings when size/bounds are known Differential Revision: https://reviews.llvm.org/D71374 --- clang/include/clang/Basic/Builtins.def | 1 + clang/lib/AST/Decl.cpp | 7 +++++++ clang/lib/CodeGen/CGBuiltin.cpp | 10 ++++++++-- clang/lib/Sema/SemaChecking.cpp | 7 +++++-- clang/test/Analysis/bstring.c | 6 ++++++ clang/test/CodeGen/mempcpy-libcall.c | 12 ++++++++++++ 6 files changed, 39 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/mempcpy-libcall.c diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 51d3500df8aee..3f166bfc22643 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -984,6 +984,7 @@ LIBBUILTIN(longjmp, "vJi", "fr", "setjmp.h", ALL_LANGUAGES) LIBBUILTIN(alloca, "v*z", "f", "stdlib.h", ALL_GNU_LANGUAGES) // POSIX string.h LIBBUILTIN(memccpy, "v*v*vC*iz", "f", "string.h", ALL_GNU_LANGUAGES) +LIBBUILTIN(mempcpy, "v*v*vC*z", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(stpcpy, "c*c*cC*", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(stpncpy, "c*c*cC*z", "f", "string.h", ALL_GNU_LANGUAGES) LIBBUILTIN(strdup, "c*cC*", "f", "string.h", ALL_GNU_LANGUAGES) diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index e760fff2ff6ca..be59d88b73f13 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -3879,6 +3879,11 @@ unsigned FunctionDecl::getMemoryFunctionKind() const { case Builtin::BImemcpy: return Builtin::BImemcpy; + case Builtin::BI__builtin_mempcpy: + case Builtin::BI__builtin___mempcpy_chk: + case Builtin::BImempcpy: + return Builtin::BImempcpy; + case Builtin::BI__builtin_memmove: case Builtin::BI__builtin___memmove_chk: case Builtin::BImemmove: @@ -3936,6 +3941,8 @@ unsigned FunctionDecl::getMemoryFunctionKind() const { return Builtin::BImemset; else if 
(FnInfo->isStr("memcpy")) return Builtin::BImemcpy; + else if (FnInfo->isStr("mempcpy")) + return Builtin::BImempcpy; else if (FnInfo->isStr("memmove")) return Builtin::BImemmove; else if (FnInfo->isStr("memcmp")) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 4b89b1b83a6a0..3fadf09c460d6 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -2500,7 +2500,9 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(nullptr); } case Builtin::BImemcpy: - case Builtin::BI__builtin_memcpy: { + case Builtin::BI__builtin_memcpy: + case Builtin::BImempcpy: + case Builtin::BI__builtin_mempcpy: { Address Dest = EmitPointerWithAlignment(E->getArg(0)); Address Src = EmitPointerWithAlignment(E->getArg(1)); Value *SizeVal = EmitScalarExpr(E->getArg(2)); @@ -2509,7 +2511,11 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(), E->getArg(1)->getExprLoc(), FD, 1); Builder.CreateMemCpy(Dest, Src, SizeVal, false); - return RValue::get(Dest.getPointer()); + if (BuiltinID == Builtin::BImempcpy || + BuiltinID == Builtin::BI__builtin_mempcpy) + return RValue::get(Builder.CreateInBoundsGEP(Dest.getPointer(), SizeVal)); + else + return RValue::get(Dest.getPointer()); } case Builtin::BI__builtin_char_memchr: diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index d8711fb6bcabb..0fd9cb7c0e494 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -340,7 +340,8 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD, case Builtin::BI__builtin___strncat_chk: case Builtin::BI__builtin___strncpy_chk: case Builtin::BI__builtin___stpncpy_chk: - case Builtin::BI__builtin___memccpy_chk: { + case Builtin::BI__builtin___memccpy_chk: + case Builtin::BI__builtin___mempcpy_chk: { DiagID = diag::warn_builtin_chk_overflow; IsChkVariant = true; SizeIndex = TheCall->getNumArgs() - 2; @@ -379,7 +380,9 @@ void Sema::checkFortifiedBuiltinMemoryFunction(FunctionDecl *FD, case Builtin::BImemmove: case Builtin::BI__builtin_memmove: case Builtin::BImemset: - case Builtin::BI__builtin_memset: { + case Builtin::BI__builtin_memset: + case Builtin::BImempcpy: + case Builtin::BI__builtin_mempcpy: { DiagID = diag::warn_fortify_source_overflow; SizeIndex = TheCall->getNumArgs() - 1; ObjectIndex = 0; diff --git a/clang/test/Analysis/bstring.c b/clang/test/Analysis/bstring.c index 2d53402a9ad36..214f6537e10ed 100644 --- a/clang/test/Analysis/bstring.c +++ b/clang/test/Analysis/bstring.c @@ -222,6 +222,9 @@ void mempcpy2 () { char dst[1]; mempcpy(dst, src, 4); // expected-warning{{Memory copy function overflows destination buffer}} +#ifndef VARIANT +// expected-warning@-2{{'mempcpy' will always overflow; destination buffer has size 1, but size argument is 4}} +#endif } void mempcpy3 () { @@ -243,6 +246,9 @@ void mempcpy5() { char dst[3]; mempcpy(dst+2, src+2, 2); // expected-warning{{Memory copy function overflows destination buffer}} +#ifndef VARIANT +// expected-warning@-2{{'mempcpy' will always overflow; destination buffer has size 1, but size argument is 2}} +#endif } void mempcpy6() { diff --git a/clang/test/CodeGen/mempcpy-libcall.c b/clang/test/CodeGen/mempcpy-libcall.c new file mode 100644 index 0000000000000..b88f494f164df --- /dev/null +++ b/clang/test/CodeGen/mempcpy-libcall.c @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -emit-llvm < %s| FileCheck %s + 
+typedef __SIZE_TYPE__ size_t; + +void *mempcpy(void *, void const *, size_t); + +char *test(char *d, char *s, size_t n) { + // CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{.*}} %[[REG1:[^ ]+]], i8* {{.*}} %1, i64 %[[REG2:[^ ]+]], i1 false) + // CHECK-NEXT: %[[REGr:[^ ]+]] = getelementptr inbounds i8, i8* %[[REG1]], i64 %[[REG2]] + // CHECK-NEXT: ret i8* %[[REGr]] + return mempcpy(d, s, n); +} From 241f335b268dc19a0dfb85686a7c82fb339421a8 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Thu, 9 Jan 2020 16:31:45 +0000 Subject: [PATCH 83/89] [OpenCL][Docs] Rename C++ for OpenCL label To avoid potential confusion with OpenCL C++. --- clang/docs/UsersManual.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index f55ffccc84d05..856d5e34bbcc2 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2661,7 +2661,8 @@ This will produce a generic test.bc file that can be used in vendor toolchains to perform machine code generation. Clang currently supports OpenCL C language standards up to v2.0. Starting from -clang 9 a C++ mode is available for OpenCL (see :ref:`C++ for OpenCL `). +clang 9 a C++ mode is available for OpenCL (see +:ref:`C++ for OpenCL `). OpenCL Specific Options ----------------------- @@ -3024,7 +3025,7 @@ There are some standard OpenCL functions that are implemented as Clang builtins: enqueue query functions from `section 6.13.17.5 `_. -.. _opencl_cpp: +.. _cxx_for_opencl: C++ for OpenCL -------------- From 6498aff249a1c3c6bad33137df3b90e2973722d6 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 9 Jan 2020 07:57:59 -0800 Subject: [PATCH 84/89] [lldb/Bindings] Move bindings into their own subdirectory All the code required to generate the language bindings for Python and Lua lives under scripts, even though the majority of this code isn't scripts at all and is surrounded by scripts that are totally unrelated. I've reorganized these files and moved everything related to the language bindings into a new top-level directory named bindings. This makes the corresponding files self-contained and much more discoverable.
Differential revision: https://reviews.llvm.org/D72437 --- lldb/CMakeLists.txt | 10 +++++----- lldb/{scripts => bindings}/CMakeLists.txt | 6 +++--- lldb/{scripts => bindings}/headers.swig | 0 lldb/{scripts => bindings}/interface/SBAddress.i | 0 lldb/{scripts => bindings}/interface/SBAttachInfo.i | 0 lldb/{scripts => bindings}/interface/SBBlock.i | 0 lldb/{scripts => bindings}/interface/SBBreakpoint.i | 0 .../interface/SBBreakpointLocation.i | 0 .../{scripts => bindings}/interface/SBBreakpointName.i | 0 lldb/{scripts => bindings}/interface/SBBroadcaster.i | 0 .../interface/SBCommandInterpreter.i | 0 .../interface/SBCommandReturnObject.i | 0 lldb/{scripts => bindings}/interface/SBCommunication.i | 0 lldb/{scripts => bindings}/interface/SBCompileUnit.i | 0 lldb/{scripts => bindings}/interface/SBData.i | 0 lldb/{scripts => bindings}/interface/SBDebugger.i | 0 lldb/{scripts => bindings}/interface/SBDeclaration.i | 0 lldb/{scripts => bindings}/interface/SBError.i | 0 lldb/{scripts => bindings}/interface/SBEvent.i | 0 .../interface/SBExecutionContext.i | 0 .../interface/SBExpressionOptions.i | 0 lldb/{scripts => bindings}/interface/SBFile.i | 0 lldb/{scripts => bindings}/interface/SBFileSpec.i | 0 lldb/{scripts => bindings}/interface/SBFileSpecList.i | 0 lldb/{scripts => bindings}/interface/SBFrame.i | 0 lldb/{scripts => bindings}/interface/SBFunction.i | 0 lldb/{scripts => bindings}/interface/SBHostOS.i | 0 lldb/{scripts => bindings}/interface/SBInstruction.i | 0 .../interface/SBInstructionList.i | 0 .../interface/SBLanguageRuntime.i | 0 lldb/{scripts => bindings}/interface/SBLaunchInfo.i | 0 lldb/{scripts => bindings}/interface/SBLineEntry.i | 0 lldb/{scripts => bindings}/interface/SBListener.i | 0 .../interface/SBMemoryRegionInfo.i | 0 .../interface/SBMemoryRegionInfoList.i | 0 lldb/{scripts => bindings}/interface/SBModule.i | 0 lldb/{scripts => bindings}/interface/SBModuleSpec.i | 0 lldb/{scripts => bindings}/interface/SBPlatform.i | 0 lldb/{scripts => bindings}/interface/SBProcess.i | 0 lldb/{scripts => bindings}/interface/SBProcessInfo.i | 0 lldb/{scripts => bindings}/interface/SBQueue.i | 0 lldb/{scripts => bindings}/interface/SBQueueItem.i | 0 lldb/{scripts => bindings}/interface/SBSection.i | 0 lldb/{scripts => bindings}/interface/SBSourceManager.i | 0 lldb/{scripts => bindings}/interface/SBStream.i | 0 lldb/{scripts => bindings}/interface/SBStringList.i | 0 .../{scripts => bindings}/interface/SBStructuredData.i | 0 lldb/{scripts => bindings}/interface/SBSymbol.i | 0 lldb/{scripts => bindings}/interface/SBSymbolContext.i | 0 .../interface/SBSymbolContextList.i | 0 lldb/{scripts => bindings}/interface/SBTarget.i | 0 lldb/{scripts => bindings}/interface/SBThread.i | 0 .../interface/SBThreadCollection.i | 0 lldb/{scripts => bindings}/interface/SBThreadPlan.i | 0 lldb/{scripts => bindings}/interface/SBTrace.i | 0 lldb/{scripts => bindings}/interface/SBTraceOptions.i | 0 lldb/{scripts => bindings}/interface/SBType.i | 0 lldb/{scripts => bindings}/interface/SBTypeCategory.i | 0 .../{scripts => bindings}/interface/SBTypeEnumMember.i | 0 lldb/{scripts => bindings}/interface/SBTypeFilter.i | 0 lldb/{scripts => bindings}/interface/SBTypeFormat.i | 0 .../interface/SBTypeNameSpecifier.i | 0 lldb/{scripts => bindings}/interface/SBTypeSummary.i | 0 lldb/{scripts => bindings}/interface/SBTypeSynthetic.i | 0 lldb/{scripts => bindings}/interface/SBUnixSignals.i | 0 lldb/{scripts => bindings}/interface/SBValue.i | 0 lldb/{scripts => bindings}/interface/SBValueList.i | 0 .../interface/SBVariablesOptions.i 
| 0 lldb/{scripts => bindings}/interface/SBWatchpoint.i | 0 lldb/{scripts => bindings}/interfaces.swig | 5 ++--- lldb/{scripts/lldb_lua.swig => bindings/lua.swig} | 0 lldb/{scripts => bindings}/macros.swig | 0 lldb/{scripts/lldb.swig => bindings/python.swig} | 2 +- .../Python => bindings/python}/createPythonInit.py | 0 .../Python => bindings/python}/python-extensions.swig | 0 .../python}/python-swigsafecast.swig | 0 .../Python => bindings/python}/python-typemaps.swig | 0 .../Python => bindings/python}/python-wrapper.swig | 0 lldb/docs/CMakeLists.txt | 4 ++-- lldb/source/API/CMakeLists.txt | 8 ++++---- 80 files changed, 17 insertions(+), 18 deletions(-) rename lldb/{scripts => bindings}/CMakeLists.txt (93%) rename lldb/{scripts => bindings}/headers.swig (100%) rename lldb/{scripts => bindings}/interface/SBAddress.i (100%) rename lldb/{scripts => bindings}/interface/SBAttachInfo.i (100%) rename lldb/{scripts => bindings}/interface/SBBlock.i (100%) rename lldb/{scripts => bindings}/interface/SBBreakpoint.i (100%) rename lldb/{scripts => bindings}/interface/SBBreakpointLocation.i (100%) rename lldb/{scripts => bindings}/interface/SBBreakpointName.i (100%) rename lldb/{scripts => bindings}/interface/SBBroadcaster.i (100%) rename lldb/{scripts => bindings}/interface/SBCommandInterpreter.i (100%) rename lldb/{scripts => bindings}/interface/SBCommandReturnObject.i (100%) rename lldb/{scripts => bindings}/interface/SBCommunication.i (100%) rename lldb/{scripts => bindings}/interface/SBCompileUnit.i (100%) rename lldb/{scripts => bindings}/interface/SBData.i (100%) rename lldb/{scripts => bindings}/interface/SBDebugger.i (100%) rename lldb/{scripts => bindings}/interface/SBDeclaration.i (100%) rename lldb/{scripts => bindings}/interface/SBError.i (100%) rename lldb/{scripts => bindings}/interface/SBEvent.i (100%) rename lldb/{scripts => bindings}/interface/SBExecutionContext.i (100%) rename lldb/{scripts => bindings}/interface/SBExpressionOptions.i (100%) rename lldb/{scripts => bindings}/interface/SBFile.i (100%) rename lldb/{scripts => bindings}/interface/SBFileSpec.i (100%) rename lldb/{scripts => bindings}/interface/SBFileSpecList.i (100%) rename lldb/{scripts => bindings}/interface/SBFrame.i (100%) rename lldb/{scripts => bindings}/interface/SBFunction.i (100%) rename lldb/{scripts => bindings}/interface/SBHostOS.i (100%) rename lldb/{scripts => bindings}/interface/SBInstruction.i (100%) rename lldb/{scripts => bindings}/interface/SBInstructionList.i (100%) rename lldb/{scripts => bindings}/interface/SBLanguageRuntime.i (100%) rename lldb/{scripts => bindings}/interface/SBLaunchInfo.i (100%) rename lldb/{scripts => bindings}/interface/SBLineEntry.i (100%) rename lldb/{scripts => bindings}/interface/SBListener.i (100%) rename lldb/{scripts => bindings}/interface/SBMemoryRegionInfo.i (100%) rename lldb/{scripts => bindings}/interface/SBMemoryRegionInfoList.i (100%) rename lldb/{scripts => bindings}/interface/SBModule.i (100%) rename lldb/{scripts => bindings}/interface/SBModuleSpec.i (100%) rename lldb/{scripts => bindings}/interface/SBPlatform.i (100%) rename lldb/{scripts => bindings}/interface/SBProcess.i (100%) rename lldb/{scripts => bindings}/interface/SBProcessInfo.i (100%) rename lldb/{scripts => bindings}/interface/SBQueue.i (100%) rename lldb/{scripts => bindings}/interface/SBQueueItem.i (100%) rename lldb/{scripts => bindings}/interface/SBSection.i (100%) rename lldb/{scripts => bindings}/interface/SBSourceManager.i (100%) rename lldb/{scripts => bindings}/interface/SBStream.i (100%) 
rename lldb/{scripts => bindings}/interface/SBStringList.i (100%) rename lldb/{scripts => bindings}/interface/SBStructuredData.i (100%) rename lldb/{scripts => bindings}/interface/SBSymbol.i (100%) rename lldb/{scripts => bindings}/interface/SBSymbolContext.i (100%) rename lldb/{scripts => bindings}/interface/SBSymbolContextList.i (100%) rename lldb/{scripts => bindings}/interface/SBTarget.i (100%) rename lldb/{scripts => bindings}/interface/SBThread.i (100%) rename lldb/{scripts => bindings}/interface/SBThreadCollection.i (100%) rename lldb/{scripts => bindings}/interface/SBThreadPlan.i (100%) rename lldb/{scripts => bindings}/interface/SBTrace.i (100%) rename lldb/{scripts => bindings}/interface/SBTraceOptions.i (100%) rename lldb/{scripts => bindings}/interface/SBType.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeCategory.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeEnumMember.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeFilter.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeFormat.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeNameSpecifier.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeSummary.i (100%) rename lldb/{scripts => bindings}/interface/SBTypeSynthetic.i (100%) rename lldb/{scripts => bindings}/interface/SBUnixSignals.i (100%) rename lldb/{scripts => bindings}/interface/SBValue.i (100%) rename lldb/{scripts => bindings}/interface/SBValueList.i (100%) rename lldb/{scripts => bindings}/interface/SBVariablesOptions.i (100%) rename lldb/{scripts => bindings}/interface/SBWatchpoint.i (100%) rename lldb/{scripts => bindings}/interfaces.swig (99%) rename lldb/{scripts/lldb_lua.swig => bindings/lua.swig} (100%) rename lldb/{scripts => bindings}/macros.swig (100%) rename lldb/{scripts/lldb.swig => bindings/python.swig} (98%) rename lldb/{scripts/Python => bindings/python}/createPythonInit.py (100%) rename lldb/{scripts/Python => bindings/python}/python-extensions.swig (100%) rename lldb/{scripts/Python => bindings/python}/python-swigsafecast.swig (100%) rename lldb/{scripts/Python => bindings/python}/python-typemaps.swig (100%) rename lldb/{scripts/Python => bindings/python}/python-wrapper.swig (100%) diff --git a/lldb/CMakeLists.txt b/lldb/CMakeLists.txt index 6170ab625c54d..573b8556989e4 100644 --- a/lldb/CMakeLists.txt +++ b/lldb/CMakeLists.txt @@ -56,7 +56,7 @@ if (LLDB_ENABLE_PYTHON) endif () if (LLDB_ENABLE_PYTHON OR LLDB_ENABLE_LUA) - add_subdirectory(scripts) + add_subdirectory(bindings) endif () # We need the headers generated by instrinsics_gen before we can compile @@ -97,7 +97,7 @@ if(LLDB_INCLUDE_TESTS) endif() if (LLDB_ENABLE_PYTHON) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) if(LLDB_BUILD_FRAMEWORK) set(lldb_python_build_path "${LLDB_FRAMEWORK_ABSOLUTE_BUILD_DIR}/LLDB.framework/Resources/Python/lldb") @@ -109,7 +109,7 @@ if (LLDB_ENABLE_PYTHON) # to liblldb.so for the Python API(hardlink on Windows). 
add_custom_target(finish_swig ALL VERBATIM COMMAND ${CMAKE_COMMAND} -E make_directory ${lldb_python_build_path} - DEPENDS ${lldb_scripts_dir}/lldb.py + DEPENDS ${lldb_bindings_dir}/lldb.py COMMENT "Python script sym-linking LLDB Python API") if(NOT LLDB_USE_SYSTEM_SIX) @@ -121,7 +121,7 @@ if (LLDB_ENABLE_PYTHON) add_custom_command(TARGET finish_swig POST_BUILD VERBATIM COMMAND ${CMAKE_COMMAND} -E copy - "${lldb_scripts_dir}/lldb.py" + "${lldb_bindings_dir}/lldb.py" "${lldb_python_build_path}/__init__.py") function(create_python_package pkg_dir) @@ -131,7 +131,7 @@ if (LLDB_ENABLE_PYTHON) endif() if(NOT ARG_NOINIT) set(init_cmd COMMAND ${PYTHON_EXECUTABLE} - "${LLDB_SOURCE_DIR}/scripts/Python/createPythonInit.py" + "${LLDB_SOURCE_DIR}/bindings/python/createPythonInit.py" "${pkg_dir}" ${ARG_FILES}) endif() add_custom_command(TARGET finish_swig POST_BUILD VERBATIM diff --git a/lldb/scripts/CMakeLists.txt b/lldb/bindings/CMakeLists.txt similarity index 93% rename from lldb/scripts/CMakeLists.txt rename to lldb/bindings/CMakeLists.txt index 515c63293bc20..92ae402c478e9 100644 --- a/lldb/scripts/CMakeLists.txt +++ b/lldb/bindings/CMakeLists.txt @@ -1,4 +1,4 @@ -file(GLOB SWIG_INTERFACES interface/*.i) +file(GLOB SWIG_INTERFACES interfaces/*.i) file(GLOB_RECURSE SWIG_SOURCES *.swig) file(GLOB SWIG_HEADERS ${LLDB_SOURCE_DIR}/include/lldb/API/*.h @@ -46,7 +46,7 @@ if (LLDB_ENABLE_PYTHON) -python -threads -o ${CMAKE_CURRENT_BINARY_DIR}/LLDBWrapPython.cpp - ${LLDB_SOURCE_DIR}/scripts/lldb.swig + ${LLDB_SOURCE_DIR}/bindings/python.swig VERBATIM COMMENT "Builds LLDB Python wrapper") @@ -67,7 +67,7 @@ if (LLDB_ENABLE_LUA) -lua -w503 -o ${CMAKE_CURRENT_BINARY_DIR}/LLDBWrapLua.cpp - ${LLDB_SOURCE_DIR}/scripts/lldb_lua.swig + ${LLDB_SOURCE_DIR}/bindings/lua.swig VERBATIM COMMENT "Builds LLDB Lua wrapper") diff --git a/lldb/scripts/headers.swig b/lldb/bindings/headers.swig similarity index 100% rename from lldb/scripts/headers.swig rename to lldb/bindings/headers.swig diff --git a/lldb/scripts/interface/SBAddress.i b/lldb/bindings/interface/SBAddress.i similarity index 100% rename from lldb/scripts/interface/SBAddress.i rename to lldb/bindings/interface/SBAddress.i diff --git a/lldb/scripts/interface/SBAttachInfo.i b/lldb/bindings/interface/SBAttachInfo.i similarity index 100% rename from lldb/scripts/interface/SBAttachInfo.i rename to lldb/bindings/interface/SBAttachInfo.i diff --git a/lldb/scripts/interface/SBBlock.i b/lldb/bindings/interface/SBBlock.i similarity index 100% rename from lldb/scripts/interface/SBBlock.i rename to lldb/bindings/interface/SBBlock.i diff --git a/lldb/scripts/interface/SBBreakpoint.i b/lldb/bindings/interface/SBBreakpoint.i similarity index 100% rename from lldb/scripts/interface/SBBreakpoint.i rename to lldb/bindings/interface/SBBreakpoint.i diff --git a/lldb/scripts/interface/SBBreakpointLocation.i b/lldb/bindings/interface/SBBreakpointLocation.i similarity index 100% rename from lldb/scripts/interface/SBBreakpointLocation.i rename to lldb/bindings/interface/SBBreakpointLocation.i diff --git a/lldb/scripts/interface/SBBreakpointName.i b/lldb/bindings/interface/SBBreakpointName.i similarity index 100% rename from lldb/scripts/interface/SBBreakpointName.i rename to lldb/bindings/interface/SBBreakpointName.i diff --git a/lldb/scripts/interface/SBBroadcaster.i b/lldb/bindings/interface/SBBroadcaster.i similarity index 100% rename from lldb/scripts/interface/SBBroadcaster.i rename to lldb/bindings/interface/SBBroadcaster.i diff --git a/lldb/scripts/interface/SBCommandInterpreter.i 
b/lldb/bindings/interface/SBCommandInterpreter.i similarity index 100% rename from lldb/scripts/interface/SBCommandInterpreter.i rename to lldb/bindings/interface/SBCommandInterpreter.i diff --git a/lldb/scripts/interface/SBCommandReturnObject.i b/lldb/bindings/interface/SBCommandReturnObject.i similarity index 100% rename from lldb/scripts/interface/SBCommandReturnObject.i rename to lldb/bindings/interface/SBCommandReturnObject.i diff --git a/lldb/scripts/interface/SBCommunication.i b/lldb/bindings/interface/SBCommunication.i similarity index 100% rename from lldb/scripts/interface/SBCommunication.i rename to lldb/bindings/interface/SBCommunication.i diff --git a/lldb/scripts/interface/SBCompileUnit.i b/lldb/bindings/interface/SBCompileUnit.i similarity index 100% rename from lldb/scripts/interface/SBCompileUnit.i rename to lldb/bindings/interface/SBCompileUnit.i diff --git a/lldb/scripts/interface/SBData.i b/lldb/bindings/interface/SBData.i similarity index 100% rename from lldb/scripts/interface/SBData.i rename to lldb/bindings/interface/SBData.i diff --git a/lldb/scripts/interface/SBDebugger.i b/lldb/bindings/interface/SBDebugger.i similarity index 100% rename from lldb/scripts/interface/SBDebugger.i rename to lldb/bindings/interface/SBDebugger.i diff --git a/lldb/scripts/interface/SBDeclaration.i b/lldb/bindings/interface/SBDeclaration.i similarity index 100% rename from lldb/scripts/interface/SBDeclaration.i rename to lldb/bindings/interface/SBDeclaration.i diff --git a/lldb/scripts/interface/SBError.i b/lldb/bindings/interface/SBError.i similarity index 100% rename from lldb/scripts/interface/SBError.i rename to lldb/bindings/interface/SBError.i diff --git a/lldb/scripts/interface/SBEvent.i b/lldb/bindings/interface/SBEvent.i similarity index 100% rename from lldb/scripts/interface/SBEvent.i rename to lldb/bindings/interface/SBEvent.i diff --git a/lldb/scripts/interface/SBExecutionContext.i b/lldb/bindings/interface/SBExecutionContext.i similarity index 100% rename from lldb/scripts/interface/SBExecutionContext.i rename to lldb/bindings/interface/SBExecutionContext.i diff --git a/lldb/scripts/interface/SBExpressionOptions.i b/lldb/bindings/interface/SBExpressionOptions.i similarity index 100% rename from lldb/scripts/interface/SBExpressionOptions.i rename to lldb/bindings/interface/SBExpressionOptions.i diff --git a/lldb/scripts/interface/SBFile.i b/lldb/bindings/interface/SBFile.i similarity index 100% rename from lldb/scripts/interface/SBFile.i rename to lldb/bindings/interface/SBFile.i diff --git a/lldb/scripts/interface/SBFileSpec.i b/lldb/bindings/interface/SBFileSpec.i similarity index 100% rename from lldb/scripts/interface/SBFileSpec.i rename to lldb/bindings/interface/SBFileSpec.i diff --git a/lldb/scripts/interface/SBFileSpecList.i b/lldb/bindings/interface/SBFileSpecList.i similarity index 100% rename from lldb/scripts/interface/SBFileSpecList.i rename to lldb/bindings/interface/SBFileSpecList.i diff --git a/lldb/scripts/interface/SBFrame.i b/lldb/bindings/interface/SBFrame.i similarity index 100% rename from lldb/scripts/interface/SBFrame.i rename to lldb/bindings/interface/SBFrame.i diff --git a/lldb/scripts/interface/SBFunction.i b/lldb/bindings/interface/SBFunction.i similarity index 100% rename from lldb/scripts/interface/SBFunction.i rename to lldb/bindings/interface/SBFunction.i diff --git a/lldb/scripts/interface/SBHostOS.i b/lldb/bindings/interface/SBHostOS.i similarity index 100% rename from lldb/scripts/interface/SBHostOS.i rename to 
lldb/bindings/interface/SBHostOS.i diff --git a/lldb/scripts/interface/SBInstruction.i b/lldb/bindings/interface/SBInstruction.i similarity index 100% rename from lldb/scripts/interface/SBInstruction.i rename to lldb/bindings/interface/SBInstruction.i diff --git a/lldb/scripts/interface/SBInstructionList.i b/lldb/bindings/interface/SBInstructionList.i similarity index 100% rename from lldb/scripts/interface/SBInstructionList.i rename to lldb/bindings/interface/SBInstructionList.i diff --git a/lldb/scripts/interface/SBLanguageRuntime.i b/lldb/bindings/interface/SBLanguageRuntime.i similarity index 100% rename from lldb/scripts/interface/SBLanguageRuntime.i rename to lldb/bindings/interface/SBLanguageRuntime.i diff --git a/lldb/scripts/interface/SBLaunchInfo.i b/lldb/bindings/interface/SBLaunchInfo.i similarity index 100% rename from lldb/scripts/interface/SBLaunchInfo.i rename to lldb/bindings/interface/SBLaunchInfo.i diff --git a/lldb/scripts/interface/SBLineEntry.i b/lldb/bindings/interface/SBLineEntry.i similarity index 100% rename from lldb/scripts/interface/SBLineEntry.i rename to lldb/bindings/interface/SBLineEntry.i diff --git a/lldb/scripts/interface/SBListener.i b/lldb/bindings/interface/SBListener.i similarity index 100% rename from lldb/scripts/interface/SBListener.i rename to lldb/bindings/interface/SBListener.i diff --git a/lldb/scripts/interface/SBMemoryRegionInfo.i b/lldb/bindings/interface/SBMemoryRegionInfo.i similarity index 100% rename from lldb/scripts/interface/SBMemoryRegionInfo.i rename to lldb/bindings/interface/SBMemoryRegionInfo.i diff --git a/lldb/scripts/interface/SBMemoryRegionInfoList.i b/lldb/bindings/interface/SBMemoryRegionInfoList.i similarity index 100% rename from lldb/scripts/interface/SBMemoryRegionInfoList.i rename to lldb/bindings/interface/SBMemoryRegionInfoList.i diff --git a/lldb/scripts/interface/SBModule.i b/lldb/bindings/interface/SBModule.i similarity index 100% rename from lldb/scripts/interface/SBModule.i rename to lldb/bindings/interface/SBModule.i diff --git a/lldb/scripts/interface/SBModuleSpec.i b/lldb/bindings/interface/SBModuleSpec.i similarity index 100% rename from lldb/scripts/interface/SBModuleSpec.i rename to lldb/bindings/interface/SBModuleSpec.i diff --git a/lldb/scripts/interface/SBPlatform.i b/lldb/bindings/interface/SBPlatform.i similarity index 100% rename from lldb/scripts/interface/SBPlatform.i rename to lldb/bindings/interface/SBPlatform.i diff --git a/lldb/scripts/interface/SBProcess.i b/lldb/bindings/interface/SBProcess.i similarity index 100% rename from lldb/scripts/interface/SBProcess.i rename to lldb/bindings/interface/SBProcess.i diff --git a/lldb/scripts/interface/SBProcessInfo.i b/lldb/bindings/interface/SBProcessInfo.i similarity index 100% rename from lldb/scripts/interface/SBProcessInfo.i rename to lldb/bindings/interface/SBProcessInfo.i diff --git a/lldb/scripts/interface/SBQueue.i b/lldb/bindings/interface/SBQueue.i similarity index 100% rename from lldb/scripts/interface/SBQueue.i rename to lldb/bindings/interface/SBQueue.i diff --git a/lldb/scripts/interface/SBQueueItem.i b/lldb/bindings/interface/SBQueueItem.i similarity index 100% rename from lldb/scripts/interface/SBQueueItem.i rename to lldb/bindings/interface/SBQueueItem.i diff --git a/lldb/scripts/interface/SBSection.i b/lldb/bindings/interface/SBSection.i similarity index 100% rename from lldb/scripts/interface/SBSection.i rename to lldb/bindings/interface/SBSection.i diff --git a/lldb/scripts/interface/SBSourceManager.i 
b/lldb/bindings/interface/SBSourceManager.i similarity index 100% rename from lldb/scripts/interface/SBSourceManager.i rename to lldb/bindings/interface/SBSourceManager.i diff --git a/lldb/scripts/interface/SBStream.i b/lldb/bindings/interface/SBStream.i similarity index 100% rename from lldb/scripts/interface/SBStream.i rename to lldb/bindings/interface/SBStream.i diff --git a/lldb/scripts/interface/SBStringList.i b/lldb/bindings/interface/SBStringList.i similarity index 100% rename from lldb/scripts/interface/SBStringList.i rename to lldb/bindings/interface/SBStringList.i diff --git a/lldb/scripts/interface/SBStructuredData.i b/lldb/bindings/interface/SBStructuredData.i similarity index 100% rename from lldb/scripts/interface/SBStructuredData.i rename to lldb/bindings/interface/SBStructuredData.i diff --git a/lldb/scripts/interface/SBSymbol.i b/lldb/bindings/interface/SBSymbol.i similarity index 100% rename from lldb/scripts/interface/SBSymbol.i rename to lldb/bindings/interface/SBSymbol.i diff --git a/lldb/scripts/interface/SBSymbolContext.i b/lldb/bindings/interface/SBSymbolContext.i similarity index 100% rename from lldb/scripts/interface/SBSymbolContext.i rename to lldb/bindings/interface/SBSymbolContext.i diff --git a/lldb/scripts/interface/SBSymbolContextList.i b/lldb/bindings/interface/SBSymbolContextList.i similarity index 100% rename from lldb/scripts/interface/SBSymbolContextList.i rename to lldb/bindings/interface/SBSymbolContextList.i diff --git a/lldb/scripts/interface/SBTarget.i b/lldb/bindings/interface/SBTarget.i similarity index 100% rename from lldb/scripts/interface/SBTarget.i rename to lldb/bindings/interface/SBTarget.i diff --git a/lldb/scripts/interface/SBThread.i b/lldb/bindings/interface/SBThread.i similarity index 100% rename from lldb/scripts/interface/SBThread.i rename to lldb/bindings/interface/SBThread.i diff --git a/lldb/scripts/interface/SBThreadCollection.i b/lldb/bindings/interface/SBThreadCollection.i similarity index 100% rename from lldb/scripts/interface/SBThreadCollection.i rename to lldb/bindings/interface/SBThreadCollection.i diff --git a/lldb/scripts/interface/SBThreadPlan.i b/lldb/bindings/interface/SBThreadPlan.i similarity index 100% rename from lldb/scripts/interface/SBThreadPlan.i rename to lldb/bindings/interface/SBThreadPlan.i diff --git a/lldb/scripts/interface/SBTrace.i b/lldb/bindings/interface/SBTrace.i similarity index 100% rename from lldb/scripts/interface/SBTrace.i rename to lldb/bindings/interface/SBTrace.i diff --git a/lldb/scripts/interface/SBTraceOptions.i b/lldb/bindings/interface/SBTraceOptions.i similarity index 100% rename from lldb/scripts/interface/SBTraceOptions.i rename to lldb/bindings/interface/SBTraceOptions.i diff --git a/lldb/scripts/interface/SBType.i b/lldb/bindings/interface/SBType.i similarity index 100% rename from lldb/scripts/interface/SBType.i rename to lldb/bindings/interface/SBType.i diff --git a/lldb/scripts/interface/SBTypeCategory.i b/lldb/bindings/interface/SBTypeCategory.i similarity index 100% rename from lldb/scripts/interface/SBTypeCategory.i rename to lldb/bindings/interface/SBTypeCategory.i diff --git a/lldb/scripts/interface/SBTypeEnumMember.i b/lldb/bindings/interface/SBTypeEnumMember.i similarity index 100% rename from lldb/scripts/interface/SBTypeEnumMember.i rename to lldb/bindings/interface/SBTypeEnumMember.i diff --git a/lldb/scripts/interface/SBTypeFilter.i b/lldb/bindings/interface/SBTypeFilter.i similarity index 100% rename from lldb/scripts/interface/SBTypeFilter.i rename to 
lldb/bindings/interface/SBTypeFilter.i diff --git a/lldb/scripts/interface/SBTypeFormat.i b/lldb/bindings/interface/SBTypeFormat.i similarity index 100% rename from lldb/scripts/interface/SBTypeFormat.i rename to lldb/bindings/interface/SBTypeFormat.i diff --git a/lldb/scripts/interface/SBTypeNameSpecifier.i b/lldb/bindings/interface/SBTypeNameSpecifier.i similarity index 100% rename from lldb/scripts/interface/SBTypeNameSpecifier.i rename to lldb/bindings/interface/SBTypeNameSpecifier.i diff --git a/lldb/scripts/interface/SBTypeSummary.i b/lldb/bindings/interface/SBTypeSummary.i similarity index 100% rename from lldb/scripts/interface/SBTypeSummary.i rename to lldb/bindings/interface/SBTypeSummary.i diff --git a/lldb/scripts/interface/SBTypeSynthetic.i b/lldb/bindings/interface/SBTypeSynthetic.i similarity index 100% rename from lldb/scripts/interface/SBTypeSynthetic.i rename to lldb/bindings/interface/SBTypeSynthetic.i diff --git a/lldb/scripts/interface/SBUnixSignals.i b/lldb/bindings/interface/SBUnixSignals.i similarity index 100% rename from lldb/scripts/interface/SBUnixSignals.i rename to lldb/bindings/interface/SBUnixSignals.i diff --git a/lldb/scripts/interface/SBValue.i b/lldb/bindings/interface/SBValue.i similarity index 100% rename from lldb/scripts/interface/SBValue.i rename to lldb/bindings/interface/SBValue.i diff --git a/lldb/scripts/interface/SBValueList.i b/lldb/bindings/interface/SBValueList.i similarity index 100% rename from lldb/scripts/interface/SBValueList.i rename to lldb/bindings/interface/SBValueList.i diff --git a/lldb/scripts/interface/SBVariablesOptions.i b/lldb/bindings/interface/SBVariablesOptions.i similarity index 100% rename from lldb/scripts/interface/SBVariablesOptions.i rename to lldb/bindings/interface/SBVariablesOptions.i diff --git a/lldb/scripts/interface/SBWatchpoint.i b/lldb/bindings/interface/SBWatchpoint.i similarity index 100% rename from lldb/scripts/interface/SBWatchpoint.i rename to lldb/bindings/interface/SBWatchpoint.i diff --git a/lldb/scripts/interfaces.swig b/lldb/bindings/interfaces.swig similarity index 99% rename from lldb/scripts/interfaces.swig rename to lldb/bindings/interfaces.swig index cc6bb91febdac..780fe34392ff5 100644 --- a/lldb/scripts/interfaces.swig +++ b/lldb/bindings/interfaces.swig @@ -32,8 +32,8 @@ %include "./interface/SBEvent.i" %include "./interface/SBExecutionContext.i" %include "./interface/SBExpressionOptions.i" -%include "./interface/SBFileSpec.i" %include "./interface/SBFile.i" +%include "./interface/SBFileSpec.i" %include "./interface/SBFileSpecList.i" %include "./interface/SBFrame.i" %include "./interface/SBFunction.i" @@ -75,9 +75,8 @@ %include "./interface/SBTypeNameSpecifier.i" %include "./interface/SBTypeSummary.i" %include "./interface/SBTypeSynthetic.i" +%include "./interface/SBUnixSignals.i" %include "./interface/SBValue.i" %include "./interface/SBValueList.i" %include "./interface/SBVariablesOptions.i" %include "./interface/SBWatchpoint.i" -%include "./interface/SBUnixSignals.i" - diff --git a/lldb/scripts/lldb_lua.swig b/lldb/bindings/lua.swig similarity index 100% rename from lldb/scripts/lldb_lua.swig rename to lldb/bindings/lua.swig diff --git a/lldb/scripts/macros.swig b/lldb/bindings/macros.swig similarity index 100% rename from lldb/scripts/macros.swig rename to lldb/bindings/macros.swig diff --git a/lldb/scripts/lldb.swig b/lldb/bindings/python.swig similarity index 98% rename from lldb/scripts/lldb.swig rename to lldb/bindings/python.swig index bebf9bf534a34..cf716da4a477a 100644 --- 
a/lldb/scripts/lldb.swig +++ b/lldb/bindings/python.swig @@ -117,7 +117,7 @@ def lldb_iter(obj, getsize, getelem): %{ #include "../source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h" -#include "../scripts/Python/python-swigsafecast.swig" +#include "../bindings/python/python-swigsafecast.swig" using namespace lldb_private; using namespace lldb_private::python; using namespace lldb; diff --git a/lldb/scripts/Python/createPythonInit.py b/lldb/bindings/python/createPythonInit.py similarity index 100% rename from lldb/scripts/Python/createPythonInit.py rename to lldb/bindings/python/createPythonInit.py diff --git a/lldb/scripts/Python/python-extensions.swig b/lldb/bindings/python/python-extensions.swig similarity index 100% rename from lldb/scripts/Python/python-extensions.swig rename to lldb/bindings/python/python-extensions.swig diff --git a/lldb/scripts/Python/python-swigsafecast.swig b/lldb/bindings/python/python-swigsafecast.swig similarity index 100% rename from lldb/scripts/Python/python-swigsafecast.swig rename to lldb/bindings/python/python-swigsafecast.swig diff --git a/lldb/scripts/Python/python-typemaps.swig b/lldb/bindings/python/python-typemaps.swig similarity index 100% rename from lldb/scripts/Python/python-typemaps.swig rename to lldb/bindings/python/python-typemaps.swig diff --git a/lldb/scripts/Python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig similarity index 100% rename from lldb/scripts/Python/python-wrapper.swig rename to lldb/bindings/python/python-wrapper.swig diff --git a/lldb/docs/CMakeLists.txt b/lldb/docs/CMakeLists.txt index 0082d004bd0d6..8fa46860e5cec 100644 --- a/lldb/docs/CMakeLists.txt +++ b/lldb/docs/CMakeLists.txt @@ -30,9 +30,9 @@ if (LLDB_ENABLE_PYTHON) # Because we don't build liblldb, epydoc will complain that the import of # _lldb.so failed, but that doesn't prevent it from generating the docs. 
file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lldb) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) add_custom_target(lldb-python-doc-package - COMMAND "${CMAKE_COMMAND}" -E copy "${lldb_scripts_dir}/lldb.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/__init__.py" + COMMAND "${CMAKE_COMMAND}" -E copy "${lldb_bindings_dir}/lldb.py" "${CMAKE_CURRENT_BINARY_DIR}/lldb/__init__.py" COMMENT "Copying lldb.py to pretend package.") add_dependencies(lldb-python-doc-package swig_wrapper) diff --git a/lldb/source/API/CMakeLists.txt b/lldb/source/API/CMakeLists.txt index eea409bed185a..e0ecf29b502b7 100644 --- a/lldb/source/API/CMakeLists.txt +++ b/lldb/source/API/CMakeLists.txt @@ -5,13 +5,13 @@ endif() get_property(LLDB_ALL_PLUGINS GLOBAL PROPERTY LLDB_PLUGINS) if(LLDB_ENABLE_PYTHON) - get_target_property(lldb_scripts_dir swig_wrapper BINARY_DIR) - set(lldb_python_wrapper ${lldb_scripts_dir}/LLDBWrapPython.cpp) + get_target_property(lldb_bindings_dir swig_wrapper BINARY_DIR) + set(lldb_python_wrapper ${lldb_bindings_dir}/LLDBWrapPython.cpp) endif() if(LLDB_ENABLE_LUA) - get_target_property(lldb_scripts_dir swig_wrapper_lua BINARY_DIR) - set(lldb_lua_wrapper ${lldb_scripts_dir}/LLDBWrapLua.cpp) + get_target_property(lldb_bindings_dir swig_wrapper_lua BINARY_DIR) + set(lldb_lua_wrapper ${lldb_bindings_dir}/LLDBWrapLua.cpp) endif() if(LLDB_BUILD_FRAMEWORK) From 189aa5b7a4584677ad628ecc2c369db61d4d2515 Mon Sep 17 00:00:00 2001 From: Kadir Cetinkaya Date: Thu, 9 Jan 2020 16:54:22 +0100 Subject: [PATCH 85/89] [clangd] Adjust diagnostic range to be inside main file Summary: LSP requires diagnostics to lie inside the main file. In clangd we keep diagnostics in three different cases: - already in main file - adjusted to a header included in main file - has a note covering some range in main file In the last case, we were not adjusting the diagnostic's range to be in main file, so these diagnostics ended up pointing at arbitrary locations. This patch fixes that issue by adjusting the range of diagnostics to be the first note inside main file when converting to LSP. Reviewers: ilya-biryukov Subscribers: MaskRay, jkorous, arphaman, usaxena95, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D72458 --- clang-tools-extra/clangd/Diagnostics.cpp | 28 +++++++++++++------ .../clangd/unittests/DiagnosticsTests.cpp | 23 +++++++++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/clang-tools-extra/clangd/Diagnostics.cpp b/clang-tools-extra/clangd/Diagnostics.cpp index e78df0322eb32..ad8f6c8bef9a1 100644 --- a/clang-tools-extra/clangd/Diagnostics.cpp +++ b/clang-tools-extra/clangd/Diagnostics.cpp @@ -22,6 +22,8 @@ #include "clang/Lex/Token.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" @@ -328,14 +330,22 @@ CodeAction toCodeAction(const Fix &F, const URIForFile &File) { void toLSPDiags( const Diag &D, const URIForFile &File, const ClangdDiagnosticOptions &Opts, llvm::function_ref)> OutFn) { - auto FillBasicFields = [](const DiagBase &D) -> clangd::Diagnostic { - clangd::Diagnostic Res; - Res.range = D.Range; - Res.severity = getSeverity(D.Severity); - return Res; - }; + clangd::Diagnostic Main; + Main.severity = getSeverity(D.Severity); + + // Main diagnostic should always refer to a range inside main file.
If a + // diagnostic made it so for, it means either itself or one of its notes is + // inside main file. + if (D.InsideMainFile) { + Main.range = D.Range; + } else { + auto It = + llvm::find_if(D.Notes, [](const Note &N) { return N.InsideMainFile; }); + assert(It != D.Notes.end() && + "neither the main diagnostic nor notes are inside main file"); + Main.range = It->Range; + } - clangd::Diagnostic Main = FillBasicFields(D); Main.code = D.Name; switch (D.Source) { case Diag::Clang: @@ -379,7 +389,9 @@ void toLSPDiags( for (auto &Note : D.Notes) { if (!Note.InsideMainFile) continue; - clangd::Diagnostic Res = FillBasicFields(Note); + clangd::Diagnostic Res; + Res.severity = getSeverity(Note.Severity); + Res.range = Note.Range; Res.message = noteMessage(D, Note, Opts); OutFn(std::move(Res), llvm::ArrayRef()); } diff --git a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp index 0941af25213ca..ef73519ef1385 100644 --- a/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp +++ b/clang-tools-extra/clangd/unittests/DiagnosticsTests.cpp @@ -1014,6 +1014,29 @@ TEST(IgnoreDiags, FromNonWrittenInclude) { EXPECT_THAT(TU.build().getDiagnostics(), UnorderedElementsAre()); } +TEST(ToLSPDiag, RangeIsInMain) { + ClangdDiagnosticOptions Opts; + clangd::Diag D; + D.Range = {pos(1, 2), pos(3, 4)}; + D.Notes.emplace_back(); + Note &N = D.Notes.back(); + N.Range = {pos(2, 3), pos(3, 4)}; + + D.InsideMainFile = true; + N.InsideMainFile = false; + toLSPDiags(D, {}, Opts, + [&](clangd::Diagnostic LSPDiag, ArrayRef) { + EXPECT_EQ(LSPDiag.range, D.Range); + }); + + D.InsideMainFile = false; + N.InsideMainFile = true; + toLSPDiags(D, {}, Opts, + [&](clangd::Diagnostic LSPDiag, ArrayRef) { + EXPECT_EQ(LSPDiag.range, N.Range); + }); +} + } // namespace } // namespace clangd } // namespace clang From 5e0bf6772e2ca450d3433fca8b47ce7bac5a6cc7 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 9 Jan 2020 09:22:04 -0800 Subject: [PATCH 86/89] [lldb/SWIG] Fix capitalization for case sensitive file systems. When moving the Python directory I renamed it to python (lowercase) but didn't update the python.swig file. --- lldb/bindings/python.swig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lldb/bindings/python.swig b/lldb/bindings/python.swig index cf716da4a477a..56fab9ff17951 100644 --- a/lldb/bindings/python.swig +++ b/lldb/bindings/python.swig @@ -111,12 +111,12 @@ def lldb_iter(obj, getsize, getelem): %} %include -%include "./Python/python-typemaps.swig" +%include "./python/python-typemaps.swig" %include "./macros.swig" %include "./headers.swig" %{ -#include "../source/Plugins/ScriptInterpreter/Python/PythonDataObjects.h" +#include "../source/Plugins/ScriptInterpreter/python/PythonDataObjects.h" #include "../bindings/python/python-swigsafecast.swig" using namespace lldb_private; using namespace lldb_private::python; @@ -124,8 +124,8 @@ using namespace lldb; %} %include "./interfaces.swig" -%include "./Python/python-extensions.swig" -%include "./Python/python-wrapper.swig" +%include "./python/python-extensions.swig" +%include "./python/python-wrapper.swig" %pythoncode%{ debugger_unique_id = 0 From 06d07ec4a372b55e6fb77bf0b97964bde16a3184 Mon Sep 17 00:00:00 2001 From: Simon Tatham Date: Thu, 9 Jan 2020 17:01:13 +0000 Subject: [PATCH 87/89] [Clang] Handle target-specific builtins returning aggregates. Summary: A few of the ARM MVE builtins directly return a structure type. 
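For example (condensed from the new tests added to vld24.c later in this patch; <arm_mve.h> and an MVE-enabled target are assumed), vld2q_u16 and vld2q_s32 return two-vector structures that user code commonly stores or indexes:

    #include <arm_mve.h>

    void load_into_variable(const uint16_t *addr, uint16x8_t *values)
    {
        uint16x8x2_t v;
        v = vld2q_u16(addr);       /* builtin result assigned to a variable */
        values[0] = v.val[0];
        values[1] = v.val[1];
    }

    int32x4_t extract_one_vector(const int32_t *addr)
    {
        return vld2q_s32(addr).val[0];   /* one field extracted directly */
    }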
This causes an assertion failure at code-gen time if you try to assign the result of the builtin to a variable, because the `RValue` created in `EmitBuiltinExpr` from the `llvm::Value` produced by codegen is always made by `RValue::get()`, which creates a non-aggregate `RValue` that will fail an assertion when `AggExprEmitter::withReturnValueSlot` calls `Src.getAggregatePointer()`. A similar failure occurs if you try to use the struct return value directly to extract one field, e.g. `vld2q(address).val[0]`. The existing code-gen tests for those MVE builtins pass the returned structure type directly to the C `return` statement, which apparently managed to avoid that particular code path, so we didn't notice the crash. Now `EmitBuiltinExpr` checks the evaluation kind of the builtin's return value, and does the necessary handling for aggregate returns. I've added two extra test cases, both of which crashed before this change. Reviewers: dmgreen, rjmccall Reviewed By: rjmccall Subscribers: kristof.beyls, cfe-commits Tags: #clang Differential Revision: https://reviews.llvm.org/D72271 --- clang/lib/CodeGen/CGBuiltin.cpp | 26 ++++++++++-- clang/test/CodeGen/arm-mve-intrinsics/vld24.c | 42 +++++++++++++++++++ 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 3fadf09c460d6..2842fe8266364 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4332,9 +4332,29 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, return RValue::get(V); } - // See if we have a target specific builtin that needs to be lowered. - if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) - return RValue::get(V); + // Some target-specific builtins can have aggregate return values, e.g. + // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force + // ReturnValue to be non-null, so that the target-specific emission code can + // always just emit into it. + TypeEvaluationKind EvalKind = getEvaluationKind(E->getType()); + if (EvalKind == TEK_Aggregate && ReturnValue.isNull()) { + Address DestPtr = CreateMemTemp(E->getType(), "agg.tmp"); + ReturnValue = ReturnValueSlot(DestPtr, false); + } + + // Now see if we can emit a target-specific builtin. 
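// Background note on the dispatch below: clang classifies expression results
// into three evaluation kinds, and the new code relies on that distinction.
//   TEK_Scalar    - the result is a single llvm::Value (integers, pointers,
//                   vectors), so RValue::get(V) is sufficient.
//   TEK_Complex   - the result is a real/imaginary pair (_Complex types).
//   TEK_Aggregate - the result lives in memory (structs, arrays), which is why
//                   the code above guarantees a ReturnValueSlot to emit into.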
+ if (Value *V = EmitTargetBuiltinExpr(BuiltinID, E, ReturnValue)) { + switch (EvalKind) { + case TEK_Scalar: + return RValue::get(V); + case TEK_Aggregate: + return RValue::getAggregate(ReturnValue.getValue(), + ReturnValue.isVolatile()); + case TEK_Complex: + llvm_unreachable("No current target builtin returns complex"); + } + llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); + } ErrorUnsupported(E, "builtin function"); diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c index 984d5989217e1..a0f37fe65d3de 100644 --- a/clang/test/CodeGen/arm-mve-intrinsics/vld24.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vld24.c @@ -98,3 +98,45 @@ void test_vst2q_f16(float16_t *addr, float16x8x2_t value) vst2q_f16(addr, value); #endif /* POLYMORPHIC */ } + +// CHECK-LABEL: @load_into_variable( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.arm.mve.vld2q.v8i16.p0i16(i16* [[ADDR:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T:%.*]] undef, <8 x i16> [[TMP1]], 0, 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_UINT16X8X2_T]] [[TMP2]], <8 x i16> [[TMP3]], 0, 1 +// CHECK-NEXT: store <8 x i16> [[TMP1]], <8 x i16>* [[VALUES:%.*]], align 8 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds <8 x i16>, <8 x i16>* [[VALUES]], i32 1 +// CHECK-NEXT: store <8 x i16> [[TMP3]], <8 x i16>* [[ARRAYIDX4]], align 8 +// CHECK-NEXT: ret void +// +void load_into_variable(const uint16_t *addr, uint16x8_t *values) +{ + uint16x8x2_t v; +#ifdef POLYMORPHIC + v = vld2q(addr); +#else /* POLYMORPHIC */ + v = vld2q_u16(addr); +#endif /* POLYMORPHIC */ + values[0] = v.val[0]; + values[1] = v.val[1]; +} + +// CHECK-LABEL: @extract_one_vector( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vld2q.v4i32.p0i32(i32* [[ADDR:%.*]]) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP0]], 0 +// CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[STRUCT_INT32X4X2_T:%.*]] undef, <4 x i32> [[TMP1]], 0, 0 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP0]], 1 +// CHECK-NEXT: [[TMP4:%.*]] = insertvalue [[STRUCT_INT32X4X2_T]] [[TMP2]], <4 x i32> [[TMP3]], 0, 1 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t extract_one_vector(const int32_t *addr) +{ +#ifdef POLYMORPHIC + return vld2q(addr).val[0]; +#else /* POLYMORPHIC */ + return vld2q_s32(addr).val[0]; +#endif /* POLYMORPHIC */ +} From ed6daa2e1d32e940ac4bd8c31ae05154c5ed9bd3 Mon Sep 17 00:00:00 2001 From: Bruno Ricci Date: Thu, 9 Jan 2020 17:11:29 +0000 Subject: [PATCH 88/89] [Support][NFC] Add a comment about the semantics of MF_HUGE_HINT flag --- llvm/include/llvm/Support/Memory.h | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/include/llvm/Support/Memory.h b/llvm/include/llvm/Support/Memory.h index 6f22dd7080cd5..c0454223b2fd1 100644 --- a/llvm/include/llvm/Support/Memory.h +++ b/llvm/include/llvm/Support/Memory.h @@ -57,6 +57,17 @@ namespace sys { MF_WRITE = 0x2000000, MF_EXEC = 0x4000000, MF_RWE_MASK = 0x7000000, + + /// The \p MF_HUGE_HINT flag is used to indicate that the request for + /// a memory block should be satisfied with large pages if possible. + /// This is only a hint and small pages will be used as fallback. 
+ /// + /// The presence or absence of this flag in the returned memory block + /// is (at least currently) *not* a reliable indicator that the memory + /// block will use or will not use large pages. On some systems a request + /// without this flag can be backed by large pages without this flag being + /// set, and on some other systems a request with this flag can fallback + /// to small pages without this flag being cleared. MF_HUGE_HINT = 0x0000001 }; From 356b33516c2e0ef241066dded16d7ecc1f7aa8cc Mon Sep 17 00:00:00 2001 From: Simon Moll Date: Thu, 9 Jan 2020 18:37:08 +0100 Subject: [PATCH 89/89] [NFC,format] Sort switch cases alphabetically This patch brings the switch cases of `llvm/lib/Support/Triple.cpp` back into alphabetical order. This was noted during the the review of https://reviews.llvm.org/D69103 Reviewed By: arsenm Differential Revision: https://reviews.llvm.org/D72452 --- llvm/lib/Support/Triple.cpp | 266 ++++++++++++++++++------------------ 1 file changed, 133 insertions(+), 133 deletions(-) diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp index 2af9ec7b9bc81..2c480c1094a5c 100644 --- a/llvm/lib/Support/Triple.cpp +++ b/llvm/lib/Support/Triple.cpp @@ -21,56 +21,56 @@ StringRef Triple::getArchTypeName(ArchType Kind) { case UnknownArch: return "unknown"; case aarch64: return "aarch64"; - case aarch64_be: return "aarch64_be"; case aarch64_32: return "aarch64_32"; + case aarch64_be: return "aarch64_be"; + case amdgcn: return "amdgcn"; + case amdil64: return "amdil64"; + case amdil: return "amdil"; + case arc: return "arc"; case arm: return "arm"; case armeb: return "armeb"; - case arc: return "arc"; case avr: return "avr"; - case bpfel: return "bpfel"; case bpfeb: return "bpfeb"; + case bpfel: return "bpfel"; case hexagon: return "hexagon"; - case mips: return "mips"; - case mipsel: return "mipsel"; + case hsail64: return "hsail64"; + case hsail: return "hsail"; + case kalimba: return "kalimba"; + case lanai: return "lanai"; + case le32: return "le32"; + case le64: return "le64"; case mips64: return "mips64"; case mips64el: return "mips64el"; + case mips: return "mips"; + case mipsel: return "mipsel"; case msp430: return "msp430"; + case nvptx64: return "nvptx64"; + case nvptx: return "nvptx"; case ppc64: return "powerpc64"; case ppc64le: return "powerpc64le"; case ppc: return "powerpc"; case r600: return "r600"; - case amdgcn: return "amdgcn"; + case renderscript32: return "renderscript32"; + case renderscript64: return "renderscript64"; case riscv32: return "riscv32"; case riscv64: return "riscv64"; + case shave: return "shave"; case sparc: return "sparc"; - case sparcv9: return "sparcv9"; case sparcel: return "sparcel"; + case sparcv9: return "sparcv9"; + case spir64: return "spir64"; + case spir: return "spir"; case systemz: return "s390x"; case tce: return "tce"; case tcele: return "tcele"; case thumb: return "thumb"; case thumbeb: return "thumbeb"; + case ve: return "ve"; + case wasm32: return "wasm32"; + case wasm64: return "wasm64"; case x86: return "i386"; case x86_64: return "x86_64"; case xcore: return "xcore"; - case nvptx: return "nvptx"; - case nvptx64: return "nvptx64"; - case le32: return "le32"; - case le64: return "le64"; - case amdil: return "amdil"; - case amdil64: return "amdil64"; - case hsail: return "hsail"; - case hsail64: return "hsail64"; - case spir: return "spir"; - case spir64: return "spir64"; - case kalimba: return "kalimba"; - case lanai: return "lanai"; - case shave: return "shave"; - case wasm32: return "wasm32"; - 
case wasm64: return "wasm64"; - case renderscript32: return "renderscript32"; - case renderscript64: return "renderscript64"; - case ve: return "ve"; } llvm_unreachable("Invalid ArchType!"); @@ -154,22 +154,22 @@ StringRef Triple::getVendorTypeName(VendorType Kind) { switch (Kind) { case UnknownVendor: return "unknown"; + case AMD: return "amd"; case Apple: return "apple"; - case PC: return "pc"; - case SCEI: return "scei"; case BGP: return "bgp"; case BGQ: return "bgq"; + case CSR: return "csr"; case Freescale: return "fsl"; case IBM: return "ibm"; case ImaginationTechnologies: return "img"; + case Mesa: return "mesa"; case MipsTechnologies: return "mti"; - case NVIDIA: return "nvidia"; - case CSR: return "csr"; case Myriad: return "myriad"; - case AMD: return "amd"; - case Mesa: return "mesa"; - case SUSE: return "suse"; + case NVIDIA: return "nvidia"; case OpenEmbedded: return "oe"; + case PC: return "pc"; + case SCEI: return "scei"; + case SUSE: return "suse"; } llvm_unreachable("Invalid VendorType!"); @@ -179,41 +179,41 @@ StringRef Triple::getOSTypeName(OSType Kind) { switch (Kind) { case UnknownOS: return "unknown"; + case AIX: return "aix"; + case AMDHSA: return "amdhsa"; + case AMDPAL: return "amdpal"; case Ananas: return "ananas"; + case CNK: return "cnk"; + case CUDA: return "cuda"; case CloudABI: return "cloudabi"; + case Contiki: return "contiki"; case Darwin: return "darwin"; case DragonFly: return "dragonfly"; + case ELFIAMCU: return "elfiamcu"; + case Emscripten: return "emscripten"; case FreeBSD: return "freebsd"; case Fuchsia: return "fuchsia"; + case Haiku: return "haiku"; + case HermitCore: return "hermit"; + case Hurd: return "hurd"; case IOS: return "ios"; case KFreeBSD: return "kfreebsd"; case Linux: return "linux"; case Lv2: return "lv2"; case MacOSX: return "macosx"; - case NetBSD: return "netbsd"; - case OpenBSD: return "openbsd"; - case Solaris: return "solaris"; - case Win32: return "windows"; - case Haiku: return "haiku"; + case Mesa3D: return "mesa3d"; case Minix: return "minix"; - case RTEMS: return "rtems"; - case NaCl: return "nacl"; - case CNK: return "cnk"; - case AIX: return "aix"; - case CUDA: return "cuda"; case NVCL: return "nvcl"; - case AMDHSA: return "amdhsa"; + case NaCl: return "nacl"; + case NetBSD: return "netbsd"; + case OpenBSD: return "openbsd"; case PS4: return "ps4"; - case ELFIAMCU: return "elfiamcu"; + case RTEMS: return "rtems"; + case Solaris: return "solaris"; case TvOS: return "tvos"; - case WatchOS: return "watchos"; - case Mesa3D: return "mesa3d"; - case Contiki: return "contiki"; - case AMDPAL: return "amdpal"; - case HermitCore: return "hermit"; - case Hurd: return "hurd"; case WASI: return "wasi"; - case Emscripten: return "emscripten"; + case WatchOS: return "watchos"; + case Win32: return "windows"; } llvm_unreachable("Invalid OSType"); @@ -222,25 +222,25 @@ StringRef Triple::getOSTypeName(OSType Kind) { StringRef Triple::getEnvironmentTypeName(EnvironmentType Kind) { switch (Kind) { case UnknownEnvironment: return "unknown"; + case Android: return "android"; + case CODE16: return "code16"; + case CoreCLR: return "coreclr"; + case Cygnus: return "cygnus"; + case EABI: return "eabi"; + case EABIHF: return "eabihf"; case GNU: return "gnu"; - case GNUABIN32: return "gnuabin32"; case GNUABI64: return "gnuabi64"; - case GNUEABIHF: return "gnueabihf"; + case GNUABIN32: return "gnuabin32"; case GNUEABI: return "gnueabi"; + case GNUEABIHF: return "gnueabihf"; case GNUX32: return "gnux32"; - case CODE16: return "code16"; - case EABI: 
return "eabi"; - case EABIHF: return "eabihf"; - case Android: return "android"; + case Itanium: return "itanium"; + case MSVC: return "msvc"; + case MacABI: return "macabi"; case Musl: return "musl"; case MuslEABI: return "musleabi"; case MuslEABIHF: return "musleabihf"; - case MSVC: return "msvc"; - case Itanium: return "itanium"; - case Cygnus: return "cygnus"; - case CoreCLR: return "coreclr"; case Simulator: return "simulator"; - case MacABI: return "macabi"; } llvm_unreachable("Invalid EnvironmentType!"); @@ -440,12 +440,12 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("spir64", Triple::spir64) .StartsWith("kalimba", Triple::kalimba) .Case("lanai", Triple::lanai) - .Case("shave", Triple::shave) - .Case("wasm32", Triple::wasm32) - .Case("wasm64", Triple::wasm64) .Case("renderscript32", Triple::renderscript32) .Case("renderscript64", Triple::renderscript64) + .Case("shave", Triple::shave) .Case("ve", Triple::ve) + .Case("wasm32", Triple::wasm32) + .Case("wasm64", Triple::wasm64) .Default(Triple::UnknownArch); // Some architectures require special parsing logic just to compute the @@ -641,10 +641,10 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { static StringRef getObjectFormatTypeName(Triple::ObjectFormatType Kind) { switch (Kind) { case Triple::UnknownObjectFormat: return ""; - case Triple::COFF: return "coff"; - case Triple::ELF: return "elf"; + case Triple::COFF: return "coff"; + case Triple::ELF: return "elf"; case Triple::MachO: return "macho"; - case Triple::Wasm: return "wasm"; + case Triple::Wasm: return "wasm"; case Triple::XCOFF: return "xcoff"; } llvm_unreachable("unknown object format type"); @@ -666,28 +666,28 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { return Triple::ELF; case Triple::aarch64_be: - case Triple::arc: case Triple::amdgcn: - case Triple::amdil: case Triple::amdil64: + case Triple::amdil: + case Triple::arc: case Triple::armeb: case Triple::avr: case Triple::bpfeb: case Triple::bpfel: case Triple::hexagon: - case Triple::lanai: - case Triple::hsail: case Triple::hsail64: + case Triple::hsail: case Triple::kalimba: + case Triple::lanai: case Triple::le32: case Triple::le64: - case Triple::mips: case Triple::mips64: case Triple::mips64el: + case Triple::mips: case Triple::mipsel: case Triple::msp430: - case Triple::nvptx: case Triple::nvptx64: + case Triple::nvptx: case Triple::ppc64le: case Triple::r600: case Triple::renderscript32: @@ -698,18 +698,18 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) { case Triple::sparc: case Triple::sparcel: case Triple::sparcv9: - case Triple::spir: case Triple::spir64: + case Triple::spir: case Triple::systemz: case Triple::tce: case Triple::tcele: case Triple::thumbeb: - case Triple::xcore: case Triple::ve: + case Triple::xcore: return Triple::ELF; - case Triple::ppc: case Triple::ppc64: + case Triple::ppc: if (T.isOSDarwin()) return Triple::MachO; else if (T.isOSAIX()) @@ -1240,56 +1240,56 @@ static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch) { return 16; case llvm::Triple::aarch64_32: + case llvm::Triple::amdil: case llvm::Triple::arc: case llvm::Triple::arm: case llvm::Triple::armeb: case llvm::Triple::hexagon: + case llvm::Triple::hsail: + case llvm::Triple::kalimba: + case llvm::Triple::lanai: case llvm::Triple::le32: case llvm::Triple::mips: case llvm::Triple::mipsel: case llvm::Triple::nvptx: case llvm::Triple::ppc: case llvm::Triple::r600: + case llvm::Triple::renderscript32: case llvm::Triple::riscv32: + case 
llvm::Triple::shave: case llvm::Triple::sparc: case llvm::Triple::sparcel: + case llvm::Triple::spir: case llvm::Triple::tce: case llvm::Triple::tcele: case llvm::Triple::thumb: case llvm::Triple::thumbeb: + case llvm::Triple::wasm32: case llvm::Triple::x86: case llvm::Triple::xcore: - case llvm::Triple::amdil: - case llvm::Triple::hsail: - case llvm::Triple::spir: - case llvm::Triple::kalimba: - case llvm::Triple::lanai: - case llvm::Triple::shave: - case llvm::Triple::wasm32: - case llvm::Triple::renderscript32: return 32; case llvm::Triple::aarch64: case llvm::Triple::aarch64_be: case llvm::Triple::amdgcn: - case llvm::Triple::bpfel: + case llvm::Triple::amdil64: case llvm::Triple::bpfeb: + case llvm::Triple::bpfel: + case llvm::Triple::hsail64: case llvm::Triple::le64: case llvm::Triple::mips64: case llvm::Triple::mips64el: case llvm::Triple::nvptx64: case llvm::Triple::ppc64: case llvm::Triple::ppc64le: + case llvm::Triple::renderscript64: case llvm::Triple::riscv64: case llvm::Triple::sparcv9: - case llvm::Triple::systemz: - case llvm::Triple::x86_64: - case llvm::Triple::amdil64: - case llvm::Triple::hsail64: case llvm::Triple::spir64: - case llvm::Triple::wasm64: - case llvm::Triple::renderscript64: + case llvm::Triple::systemz: case llvm::Triple::ve: + case llvm::Triple::wasm64: + case llvm::Triple::x86_64: return 64; } llvm_unreachable("Invalid architecture value"); @@ -1313,61 +1313,61 @@ Triple Triple::get32BitArchVariant() const { case Triple::UnknownArch: case Triple::amdgcn: case Triple::avr: - case Triple::bpfel: case Triple::bpfeb: + case Triple::bpfel: case Triple::msp430: - case Triple::systemz: case Triple::ppc64le: + case Triple::systemz: case Triple::ve: T.setArch(UnknownArch); break; case Triple::aarch64_32: case Triple::amdil: - case Triple::hsail: - case Triple::spir: case Triple::arc: case Triple::arm: case Triple::armeb: case Triple::hexagon: + case Triple::hsail: case Triple::kalimba: + case Triple::lanai: case Triple::le32: case Triple::mips: case Triple::mipsel: case Triple::nvptx: case Triple::ppc: case Triple::r600: + case Triple::renderscript32: case Triple::riscv32: + case Triple::shave: case Triple::sparc: case Triple::sparcel: + case Triple::spir: case Triple::tce: case Triple::tcele: case Triple::thumb: case Triple::thumbeb: + case Triple::wasm32: case Triple::x86: case Triple::xcore: - case Triple::lanai: - case Triple::shave: - case Triple::wasm32: - case Triple::renderscript32: // Already 32-bit. 
break; case Triple::aarch64: T.setArch(Triple::arm); break; case Triple::aarch64_be: T.setArch(Triple::armeb); break; + case Triple::amdil64: T.setArch(Triple::amdil); break; + case Triple::hsail64: T.setArch(Triple::hsail); break; case Triple::le64: T.setArch(Triple::le32); break; case Triple::mips64: T.setArch(Triple::mips); break; case Triple::mips64el: T.setArch(Triple::mipsel); break; case Triple::nvptx64: T.setArch(Triple::nvptx); break; case Triple::ppc64: T.setArch(Triple::ppc); break; - case Triple::sparcv9: T.setArch(Triple::sparc); break; + case Triple::renderscript64: T.setArch(Triple::renderscript32); break; case Triple::riscv64: T.setArch(Triple::riscv32); break; - case Triple::x86_64: T.setArch(Triple::x86); break; - case Triple::amdil64: T.setArch(Triple::amdil); break; - case Triple::hsail64: T.setArch(Triple::hsail); break; + case Triple::sparcv9: T.setArch(Triple::sparc); break; case Triple::spir64: T.setArch(Triple::spir); break; case Triple::wasm64: T.setArch(Triple::wasm32); break; - case Triple::renderscript64: T.setArch(Triple::renderscript32); break; + case Triple::x86_64: T.setArch(Triple::x86); break; } return T; } @@ -1383,56 +1383,56 @@ Triple Triple::get64BitArchVariant() const { case Triple::lanai: case Triple::msp430: case Triple::r600: + case Triple::shave: + case Triple::sparcel: case Triple::tce: case Triple::tcele: case Triple::xcore: - case Triple::sparcel: - case Triple::shave: T.setArch(UnknownArch); break; case Triple::aarch64: case Triple::aarch64_be: - case Triple::bpfel: - case Triple::bpfeb: - case Triple::le64: - case Triple::amdil64: case Triple::amdgcn: + case Triple::amdil64: + case Triple::bpfeb: + case Triple::bpfel: case Triple::hsail64: - case Triple::spir64: + case Triple::le64: case Triple::mips64: case Triple::mips64el: case Triple::nvptx64: case Triple::ppc64: case Triple::ppc64le: + case Triple::renderscript64: case Triple::riscv64: case Triple::sparcv9: + case Triple::spir64: case Triple::systemz: - case Triple::x86_64: - case Triple::wasm64: - case Triple::renderscript64: case Triple::ve: + case Triple::wasm64: + case Triple::x86_64: // Already 64-bit. 
break; case Triple::aarch64_32: T.setArch(Triple::aarch64); break; + case Triple::amdil: T.setArch(Triple::amdil64); break; case Triple::arm: T.setArch(Triple::aarch64); break; case Triple::armeb: T.setArch(Triple::aarch64_be); break; + case Triple::hsail: T.setArch(Triple::hsail64); break; case Triple::le32: T.setArch(Triple::le64); break; case Triple::mips: T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips64el); break; case Triple::nvptx: T.setArch(Triple::nvptx64); break; case Triple::ppc: T.setArch(Triple::ppc64); break; - case Triple::sparc: T.setArch(Triple::sparcv9); break; + case Triple::renderscript32: T.setArch(Triple::renderscript64); break; case Triple::riscv32: T.setArch(Triple::riscv64); break; - case Triple::x86: T.setArch(Triple::x86_64); break; - case Triple::amdil: T.setArch(Triple::amdil64); break; - case Triple::hsail: T.setArch(Triple::hsail64); break; + case Triple::sparc: T.setArch(Triple::sparcv9); break; case Triple::spir: T.setArch(Triple::spir64); break; case Triple::thumb: T.setArch(Triple::aarch64); break; case Triple::thumbeb: T.setArch(Triple::aarch64_be); break; case Triple::wasm32: T.setArch(Triple::wasm64); break; - case Triple::renderscript32: T.setArch(Triple::renderscript64); break; + case Triple::x86: T.setArch(Triple::x86_64); break; } return T; } @@ -1458,6 +1458,8 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::nvptx64: case Triple::nvptx: case Triple::r600: + case Triple::renderscript32: + case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: case Triple::shave: @@ -1468,8 +1470,6 @@ Triple Triple::getBigEndianArchVariant() const { case Triple::x86: case Triple::x86_64: case Triple::xcore: - case Triple::renderscript32: - case Triple::renderscript64: case Triple::ve: // ARM is intentionally unsupported here, changing the architecture would @@ -1479,13 +1479,13 @@ Triple Triple::getBigEndianArchVariant() const { T.setArch(UnknownArch); break; - case Triple::tcele: T.setArch(Triple::tce); break; case Triple::aarch64: T.setArch(Triple::aarch64_be); break; case Triple::bpfel: T.setArch(Triple::bpfeb); break; case Triple::mips64el:T.setArch(Triple::mips64); break; case Triple::mipsel: T.setArch(Triple::mips); break; case Triple::ppc64le: T.setArch(Triple::ppc64); break; case Triple::sparcel: T.setArch(Triple::sparc); break; + case Triple::tcele: T.setArch(Triple::tce); break; default: llvm_unreachable("getBigEndianArchVariant: unknown triple."); } @@ -1511,13 +1511,13 @@ Triple Triple::getLittleEndianArchVariant() const { T.setArch(UnknownArch); break; - case Triple::tce: T.setArch(Triple::tcele); break; case Triple::aarch64_be: T.setArch(Triple::aarch64); break; case Triple::bpfeb: T.setArch(Triple::bpfel); break; case Triple::mips64: T.setArch(Triple::mips64el); break; case Triple::mips: T.setArch(Triple::mipsel); break; case Triple::ppc64: T.setArch(Triple::ppc64le); break; case Triple::sparc: T.setArch(Triple::sparcel); break; + case Triple::tce: T.setArch(Triple::tcele); break; default: llvm_unreachable("getLittleEndianArchVariant: unknown triple."); } @@ -1547,22 +1547,22 @@ bool Triple::isLittleEndian() const { case Triple::nvptx: case Triple::ppc64le: case Triple::r600: + case Triple::renderscript32: + case Triple::renderscript64: case Triple::riscv32: case Triple::riscv64: case Triple::shave: case Triple::sparcel: case Triple::spir64: case Triple::spir: + case Triple::tcele: case Triple::thumb: + case Triple::ve: case Triple::wasm32: case Triple::wasm64: case Triple::x86: case 
Triple::x86_64: case Triple::xcore: - case Triple::tcele: - case Triple::renderscript32: - case Triple::renderscript64: - case Triple::ve: return true; default: return false; @@ -1617,10 +1617,10 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { case llvm::Triple::Win32: // FIXME: this is invalid for WindowsCE return "cortex-a9"; - case llvm::Triple::MacOSX: case llvm::Triple::IOS: - case llvm::Triple::WatchOS: + case llvm::Triple::MacOSX: case llvm::Triple::TvOS: + case llvm::Triple::WatchOS: if (MArch == "v7k") return "cortex-a7"; break; @@ -1640,10 +1640,10 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const { switch (getOS()) { case llvm::Triple::NetBSD: switch (getEnvironment()) { - case llvm::Triple::GNUEABIHF: - case llvm::Triple::GNUEABI: - case llvm::Triple::EABIHF: case llvm::Triple::EABI: + case llvm::Triple::EABIHF: + case llvm::Triple::GNUEABI: + case llvm::Triple::GNUEABIHF: return "arm926ej-s"; default: return "strongarm";
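Looking back at the MF_HUGE_HINT semantics documented in [PATCH 88/89] above, a minimal usage sketch follows. The helper name is illustrative only; allocateMappedMemory and releaseMappedMemory are the existing llvm::sys::Memory interface.

    #include "llvm/Support/Memory.h"

    #include <system_error>

    using namespace llvm;

    // Ask for a read/write block and hint that it should be backed by large
    // pages. Per the new comment, this is only a hint: the block may still be
    // backed by small pages, and the flag on the returned block is not a
    // reliable indicator either way.
    sys::MemoryBlock allocateHugeHinted(size_t NumBytes, std::error_code &EC) {
      unsigned Flags = sys::Memory::MF_READ | sys::Memory::MF_WRITE |
                       sys::Memory::MF_HUGE_HINT;
      return sys::Memory::allocateMappedMemory(NumBytes, /*NearBlock=*/nullptr,
                                               Flags, EC);
    }

The block is released as usual with sys::Memory::releaseMappedMemory; callers should not branch on whether large pages were actually used, since the flag does not report that reliably.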