diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 8b6656834cc06..64f371e9f8db8 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -188,9 +188,6 @@ jobs: with: ref: ${{ needs.prepare.outputs.ref }} - - name: Install Ninja - uses: llvm/actions/install-ninja@5dd955034a6742a2e21d82bf165fcb1050ae7b49 # main - - name: Set Build Prefix id: setup-stage shell: bash diff --git a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h index 7afe97aac57e8..bf87654979cc9 100644 --- a/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h +++ b/flang/include/flang/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.h @@ -16,7 +16,9 @@ #include "mlir/Dialect/OpenACC/OpenACC.h" namespace fir { +class AddrOfOp; class DeclareOp; +class GlobalOp; } // namespace fir namespace hlfir { @@ -53,6 +55,18 @@ struct PartialEntityAccessModel bool isCompleteView(mlir::Operation *op) const; }; +struct AddressOfGlobalModel + : public mlir::acc::AddressOfGlobalOpInterface::ExternalModel< + AddressOfGlobalModel, fir::AddrOfOp> { + mlir::SymbolRefAttr getSymbol(mlir::Operation *op) const; +}; + +struct GlobalVariableModel + : public mlir::acc::GlobalVariableOpInterface::ExternalModel< + GlobalVariableModel, fir::GlobalOp> { + bool isConstant(mlir::Operation *op) const; +}; + } // namespace fir::acc #endif // FLANG_OPTIMIZER_OPENACC_FIROPENACC_OPS_INTERFACES_H_ diff --git a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp index c1734be5185f4..11fbaf2dc2bb8 100644 --- a/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/FIROpenACCOpsInterfaces.cpp @@ -59,4 +59,13 @@ bool PartialEntityAccessModel::isCompleteView( return !getBaseEntity(op); } +mlir::SymbolRefAttr AddressOfGlobalModel::getSymbol(mlir::Operation *op) const { + return mlir::cast(op).getSymbolAttr(); +} + +bool GlobalVariableModel::isConstant(mlir::Operation *op) const { + auto globalOp = mlir::cast(op); + return globalOp.getConstant().has_value(); +} + } // namespace fir::acc diff --git a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp index d71c40dfac03c..5c7f9985d41ca 100644 --- a/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp +++ b/flang/lib/Optimizer/OpenACC/Support/RegisterOpenACCExtensions.cpp @@ -49,6 +49,9 @@ void registerOpenACCExtensions(mlir::DialectRegistry ®istry) { PartialEntityAccessModel>(*ctx); fir::DeclareOp::attachInterface>( *ctx); + + fir::AddrOfOp::attachInterface(*ctx); + fir::GlobalOp::attachInterface(*ctx); }); // Register HLFIR operation interfaces diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index f2b168f6cb0e3..5f7fb00889655 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA Membar/Fences ------------- +'``llvm.nvvm.fence.acquire/release.sync_restrict.*``' +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +.. 
code-block:: llvm

  declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
  declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()

Overview:
"""""""""

The `nvvm.fence.{semantics}.sync_restrict.*` intrinsics restrict the class of
memory operations for which the fence instruction provides the memory ordering
guarantees. When `.sync_restrict` is restricted to `shared_cta`, the memory
semantics must be `release` and the effect of the fence only applies to
operations performed on objects in the `shared_cta` space. Likewise, when
`.sync_restrict` is restricted to `shared_cluster`, the memory semantics must be
`acquire` and the effect of the fence only applies to operations performed on
objects in the `shared_cluster` memory space. The scope for both operations is
`cluster`. For more details, please refer to the `PTX ISA `__

'``llvm.nvvm.fence.mbarrier_init.release.cluster``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.fence.mbarrier_init.release.cluster()

Overview:
"""""""""

The `nvvm.fence.mbarrier_init.release.cluster` intrinsic restricts the class of
memory operations for which the fence instruction provides the memory ordering
guarantees. The `mbarrier_init` modifier restricts the synchronizing effect to
the prior `mbarrier_init` operation executed by the same thread on mbarrier
objects in the `shared_cta` memory space. For more details, please refer to the
`PTX ISA `__

'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. code-block:: llvm

  declare void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
  declare void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()

Overview:
"""""""""

The `nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` intrinsics are
used to establish ordering between a prior memory access performed via the
`async proxy__` and a subsequent memory access performed via the generic proxy.
``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release
sequence that synchronizes with an acquire sequence containing the
``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When
`.sync_restrict` is restricted to `shared_cta`, the memory semantics must be
`release` and the effect of the fence only applies to operations performed on
objects in the `shared_cta` space. Likewise, when `.sync_restrict` is restricted
to `shared_cluster`, the memory semantics must be `acquire` and the effect of
the fence only applies to operations performed on objects in the
`shared_cluster` memory space. The scope for both operations is `cluster`.
For more details, please refer to the `PTX ISA `__

'``llvm.nvvm.fence.proxy.<proxykind>``'
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Syntax:
"""""""

.. 
code-block:: llvm + + declare void @llvm.nvvm.fence.proxy.alias() + declare void @llvm.nvvm.fence.proxy.async() + declare void @llvm.nvvm.fence.proxy.async.global() + declare void @llvm.nvvm.fence.proxy.async.shared_cluster() + declare void @llvm.nvvm.fence.proxy.async.shared_cta() + +Overview: +""""""""" + +`nvvm.fence.proxy.{proxykind}` intrinsics represent a fence with bi-directional +proxy ordering that is established between the memory accesses done between the +`generic proxy__` +and the proxy specified by `proxykind`. A `bi-directional proxy` ordering between +two proxykinds establishes two `uni-directional` proxy orderings: one from the +first proxykind to the second proxykind and the other from the second proxykind +to the first proxykind. + +`alias` proxykind refers to memory accesses performed using virtually aliased +addresses to the same memory location + +`async` proxykind specifies that the memory ordering is established between the +`async proxy` and the `generic proxy`. The memory ordering is limited only to +operations performed on objects in the state space specified (`generic`, `global`, +`shared_cluster`, `shared_cta`). If no state space is specified, then the memory +ordering applies on all state spaces. For more details, please refer the +`PTX ISA `__ + '``llvm.nvvm.fence.proxy.tensormap_generic.*``' ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0f17312b03827..a65e4667ab76c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -123,6 +123,32 @@ struct HardwareLoopInfo { LLVM_ABI bool canAnalyze(LoopInfo &LI); }; +/// Information for memory intrinsic cost model. +class MemIntrinsicCostAttributes { + /// Vector type of the data to be loaded or stored. + Type *DataTy = nullptr; + + /// ID of the memory intrinsic. + Intrinsic::ID IID; + + /// Address space of the pointer. + unsigned AddressSpace = 0; + + /// Alignment of single element. + Align Alignment; + +public: + LLVM_ABI MemIntrinsicCostAttributes(Intrinsic::ID Id, Type *DataTy, + Align Alignment, unsigned AddressSpace) + : DataTy(DataTy), IID(Id), AddressSpace(AddressSpace), + Alignment(Alignment) {} + + Intrinsic::ID getID() const { return IID; } + Type *getDataType() const { return DataTy; } + unsigned getAddressSpace() const { return AddressSpace; } + Align getAlignment() const { return Alignment; } +}; + class IntrinsicCostAttributes { const IntrinsicInst *II = nullptr; Type *RetTy = nullptr; @@ -1556,7 +1582,7 @@ class TargetTransformInfo { /// \return The cost of masked Load and Store instructions. 
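
  /// The memory operation is described by \p MICA: the intrinsic ID
  /// (masked_load or masked_store), the data type, the alignment, and the
  /// address space of the access.
  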
LLVM_ABI InstructionCost getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const; /// \return The cost of Gather or Scatter operation diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index aacb88d2f9684..d8e35748f53e5 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -842,8 +842,7 @@ class TargetTransformInfoImplBase { } virtual InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { return 1; } diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 944e1714e8f98..cb389ae74ef46 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1558,9 +1558,13 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *DataTy, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override { + Type *DataTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned Opcode = MICA.getID() == Intrinsic::masked_load + ? Instruction::Load + : Instruction::Store; // TODO: Pass on AddressSpace when we have test coverage. return getCommonMaskedMemoryOpCost(Opcode, DataTy, Alignment, true, false, CostKind); @@ -1617,10 +1621,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { // Firstly, the cost of load/store operation. InstructionCost Cost; - if (UseMaskForCond || UseMaskForGaps) - Cost = thisT()->getMaskedMemoryOpCost(Opcode, VecTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskForCond || UseMaskForGaps) { + unsigned IID = Opcode == Instruction::Load ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + Cost = thisT()->getMaskedMemoryOpCost( + {IID, VecTy, Alignment, AddressSpace}, CostKind); + } else Cost = thisT()->getMemoryOpCost(Opcode, VecTy, Alignment, AddressSpace, CostKind); @@ -2403,14 +2409,12 @@ class BasicTTIImplBase : public TargetTransformInfoImplCRTPBase { case Intrinsic::masked_store: { Type *Ty = Tys[0]; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Store, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::masked_load: { Type *Ty = RetTy; Align TyAlign = thisT()->DL.getABITypeAlign(Ty); - return thisT()->getMaskedMemoryOpCost(Instruction::Load, Ty, TyAlign, 0, - CostKind); + return thisT()->getMaskedMemoryOpCost({IID, Ty, TyAlign, 0}, CostKind); } case Intrinsic::experimental_vp_strided_store: { auto *Ty = cast(ICA.getArgTypes()[0]); diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 1b485dc8ccd1e..c8e776e6499df 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -1746,33 +1746,65 @@ let TargetPrefix = "nvvm" in { def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>; } - // - // Membar - // - let IntrProperties = [IntrNoCallback] in { +// +// Membar / Fence +// +let IntrProperties = [IntrNoCallback] in { def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>; def int_nvvm_fence_sc_cluster : Intrinsic<[]>; - } - // - // Proxy fence (uni-directional) - // + // Operation fence + def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [], + "llvm.nvvm.fence.mbarrier_init.release.cluster">; + + // Thread fence + def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">; + +// +// Proxy fence (uni-directional) +// + + def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">; + + def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster : + Intrinsic<[], [], [], + "llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">; + foreach scope = ["cta", "cluster", "gpu", "sys"] in { def int_nvvm_fence_proxy_tensormap_generic_release_ # scope : - Intrinsic<[], [], [IntrNoCallback], + Intrinsic<[], [], [], "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>; // The imm-arg 'size' can only be 128. def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope : - Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], - [IntrNoCallback, IntrArgMemOnly, ImmArg>, - Range, 128, 129>], - "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>; + Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [], + "llvm.nvvm.fence.proxy.tensormap_generic.acquire." 
# scope> { + let IntrProperties = [IntrNoCallback, IntrArgMemOnly, + ImmArg>, Range, 128, 129>]; + } } +// +// Proxy fence (bi-directional) +// + foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>; + } +} + // // Async Copy // diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index db99885121ec1..6abde996e6dc8 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -1579,6 +1579,99 @@ def CONVERGENCECTRL_GLUE : StandardPseudoInstruction { } } +/// Allow a target to replace the instruction definition of a +/// StandardPseudoInstruction. A target should only define one +/// instance of this per instruction. +/// +/// This is intended to allow targets to specify the register class +/// used for pointers. It should not be used to change the fundamental +/// operand structure (e.g., this should not add or remove operands, +/// or change the operand types). +class TargetSpecializedStandardPseudoInstruction< + StandardPseudoInstruction base_inst> : Instruction { + + StandardPseudoInstruction Instruction = base_inst; + let OutOperandList = base_inst.OutOperandList; + let InOperandList = base_inst.InOperandList; + + // TODO: Copy everything + let usesCustomInserter = base_inst.usesCustomInserter; + let hasSideEffects = base_inst.hasSideEffects; + let mayLoad = base_inst.mayLoad; + let mayStore = base_inst.mayStore; + let isTerminator = base_inst.isTerminator; + let isBranch = base_inst.isBranch; + let isIndirectBranch = base_inst.isIndirectBranch; + let isEHScopeReturn = base_inst.isEHScopeReturn; + let isReturn = base_inst.isReturn; + let isCall = base_inst.isCall; + let hasCtrlDep = base_inst.hasCtrlDep; + let isReMaterializable = base_inst.isReMaterializable; + let isMeta = base_inst.isMeta; + let Size = base_inst.Size; + let isAsCheapAsAMove = base_inst.isAsCheapAsAMove; + let isPseudo = true; + let hasNoSchedulingInfo = true; + let isNotDuplicable = base_inst.isNotDuplicable; + let isConvergent = base_inst.isConvergent; + let hasExtraSrcRegAllocReq = base_inst.hasExtraSrcRegAllocReq; + let hasExtraDefRegAllocReq = base_inst.hasExtraDefRegAllocReq; +} + +// All pseudo instructions which need a pointer register class, which +// should be specialized by a target. +defvar PseudosWithPtrOps = [ + LOAD_STACK_GUARD, + PREALLOCATED_ARG, + PATCHABLE_EVENT_CALL, + PATCHABLE_TYPED_EVENT_CALL +]; + + +/// Replace PointerLikeRegClass operands in OperandList with new_rc. +class RemapPointerOperandList { + // Collect the set of names so we can query and rewrite them. + list op_names = !foreach(i, !range(!size(OperandList)), + !getdagname(OperandList, i)); + + // Beautiful language. This would be a lot easier if !getdagarg + // didn't require a specific type. We can't just collect a list of + // the operand values and reconstruct the dag, since there isn't a + // common base class for all the field kinds used in + // pseudoinstruction definitions; therefore everything must be + // maintained as a dag, so use a foldl. Additionally, ? doesn't + // evaluate as false so we get even more noise. 
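
+  // For example, remapping LOAD_STACK_GUARD's (outs ptr_rc:$dst) with
+  // new_rc = XRegs produces (outs XRegs:$dst); operands that are not a
+  // PointerLikeRegClass are passed through unchanged.
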
+ dag ret = + !foldl(OperandList, op_names, acc, name, + !cond( + !initialized(!getdagarg(OperandList, name)) + : !setdagarg(acc, name, new_rc), + !initialized(!getdagarg(OperandList, name)) : acc, + !initialized(!getdagarg(OperandList, name)) : acc + ) + ); +} + +/// Define an override for a pseudoinstruction which uses a pointer +/// register class, specialized to the target's pointer type. +class RemapPointerOperands : + TargetSpecializedStandardPseudoInstruction { + let OutOperandList = + RemapPointerOperandList.ret; + let InOperandList = + RemapPointerOperandList.ret; +} + +/// Helper to replace all pseudoinstructions using pointers to a +/// target register class. Most targets should use this. +multiclass RemapAllTargetPseudoPointerOperands< + RegisterClassLike default_ptr_rc> { + foreach inst = PseudosWithPtrOps in { + def : RemapPointerOperands; + } +} + // Generic opcodes used in GlobalISel. include "llvm/Target/GenericOpcodes.td" diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 0426ac7e62fab..45369f0ffe137 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1183,10 +1183,9 @@ InstructionCost TargetTransformInfo::getMemoryOpCost( } InstructionCost TargetTransformInfo::getMaskedMemoryOpCost( - unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace, + const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, - AddressSpace, CostKind); + InstructionCost Cost = TTIImpl->getMaskedMemoryOpCost(MICA, CostKind); assert(Cost >= 0 && "TTI should not produce negative costs!"); return Cost; } diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 9dfb6af58323a..1099faca9fa46 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -459,6 +459,15 @@ void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, addBlock(Die, Attribute, Block->BestForm(), Block); } +void DwarfUnit::addBlock(DIE &Die, dwarf::Attribute Attribute, + const DIExpression *Expr) { + DIELoc *Loc = new (DIEValueAllocator) DIELoc; + DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); + DwarfExpr.setMemoryLocationKind(); + DwarfExpr.addExpression(Expr); + addBlock(Die, Attribute, DwarfExpr.finalize()); +} + void DwarfUnit::addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File) { if (Line == 0) @@ -842,27 +851,14 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIStringType *STy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_string_length, *VarDIE); } else if (DIExpression *Expr = STy->getStringLengthExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // length of a Fortran deferred length string, so - // lock it down as such. 
- DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_string_length, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_string_length, Expr); } else { uint64_t Size = STy->getSizeInBits() >> 3; addUInt(Buffer, dwarf::DW_AT_byte_size, std::nullopt, Size); } if (DIExpression *Expr = STy->getStringLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - // This is to describe the memory location of the - // string, so lock it down as such. - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (STy->getEncoding()) { @@ -1618,11 +1614,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &DW_Subrange, const DISubrangeType *SR, if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_GNU_bias) { if (BI->getSExtValue() != 0) @@ -1660,11 +1652,7 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR) { if (auto *VarDIE = getDIE(BV)) addDIEEntry(DW_Subrange, Attr, *VarDIE); } else if (auto *BE = dyn_cast_if_present(Bound)) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DW_Subrange, Attr, DwarfExpr.finalize()); + addBlock(DW_Subrange, Attr, BE); } else if (auto *BI = dyn_cast_if_present(Bound)) { if (Attr == dwarf::DW_AT_count) { if (BI->getSExtValue() != -1) @@ -1710,11 +1698,7 @@ void DwarfUnit::constructGenericSubrangeDIE(DIE &Buffer, addSInt(DwGenericSubrange, Attr, dwarf::DW_FORM_sdata, BE->getElement(1)); } else { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(BE); - addBlock(DwGenericSubrange, Attr, DwarfExpr.finalize()); + addBlock(DwGenericSubrange, Attr, BE); } } }; @@ -1781,44 +1765,28 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE); } else if (DIExpression *Expr = CTy->getDataLocationExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_data_location, Expr); } if (DIVariable *Var = CTy->getAssociated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, dwarf::DW_AT_associated, *VarDIE); } else if (DIExpression *Expr = CTy->getAssociatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_associated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_associated, Expr); } if (DIVariable *Var = CTy->getAllocated()) { if (auto *VarDIE = getDIE(Var)) addDIEEntry(Buffer, 
dwarf::DW_AT_allocated, *VarDIE); } else if (DIExpression *Expr = CTy->getAllocatedExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(Expr); - addBlock(Buffer, dwarf::DW_AT_allocated, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_allocated, Expr); } if (auto *RankConst = CTy->getRankConst()) { addSInt(Buffer, dwarf::DW_AT_rank, dwarf::DW_FORM_sdata, RankConst->getSExtValue()); } else if (auto *RankExpr = CTy->getRankExp()) { - DIELoc *Loc = new (DIEValueAllocator) DIELoc; - DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc); - DwarfExpr.setMemoryLocationKind(); - DwarfExpr.addExpression(RankExpr); - addBlock(Buffer, dwarf::DW_AT_rank, DwarfExpr.finalize()); + addBlock(Buffer, dwarf::DW_AT_rank, RankExpr); } if (auto *BitStride = CTy->getBitStrideConst()) { diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h index 7841ff7fa5952..6875c415057d2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -218,6 +218,9 @@ class DwarfUnit : public DIEUnit { void addBlock(DIE &Die, dwarf::Attribute Attribute, dwarf::Form Form, DIEBlock *Block); + /// Add an expression as block data. + void addBlock(DIE &Die, dwarf::Attribute Attribute, const DIExpression *Expr); + /// Add location information to specified debug information entry. void addSourceLine(DIE &Die, unsigned Line, unsigned Column, const DIFile *File); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index bf195ca210e9b..0bae00bafee3c 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4730,12 +4730,12 @@ bool AArch64TTIImpl::prefersVectorizedAddressing() const { } InstructionCost -AArch64TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +AArch64TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + Type *Src = MICA.getDataType(); + if (useNeonVector(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); auto LT = getTypeLegalizationCost(Src); if (!LT.first.isValid()) return InstructionCost::getInvalid(); diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index d189f563f99a1..6cc4987428567 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -188,8 +188,7 @@ class AArch64TTIImpl final : public BasicTTIImplBase { unsigned Opcode2) const; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index ca98b80787fb4..a87f9f274a4d3 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -1423,7 +1423,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, /// \returns true if the target has packed f32 instructions that only read 32 /// bits from a scalar operand (SGPR or literal) and replicates the bits to /// both channels. 
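
  /// For example, a V_PK_ADD_F32 whose scalar source is a 64-bit literal uses
  /// only the low 32 bits of that literal for both f32 lanes.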
- bool hasPKF32InstsReplicatingLow32BitsOfScalarInput() const { + bool hasPKF32InstsReplicatingLower32BitsOfScalarInput() const { return getGeneration() == GFX12 && GFX1250Insts; } diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index cfc0c16ea30e5..8a959125de402 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -766,29 +766,21 @@ static void appendFoldCandidate(SmallVectorImpl &FoldList, FoldCandidate(MI, OpNo, FoldOp, Commuted, ShrinkOp)); } -// Returns true if the instruction is a packed f32 instruction that only reads -// 32 bits from a scalar operand (SGPR or literal) and replicates the bits to -// both channels. -static bool -isPKF32InstrReplicatingLow32BitsOfScalarInput(const GCNSubtarget *ST, - MachineInstr *MI) { - if (!ST->hasPKF32InstsReplicatingLow32BitsOfScalarInput()) +// Returns true if the instruction is a packed F32 instruction and the +// corresponding scalar operand reads 32 bits and replicates the bits to both +// channels. +static bool isPKF32InstrReplicatesLower32BitsOfScalarOperand( + const GCNSubtarget *ST, MachineInstr *MI, unsigned OpNo) { + if (!ST->hasPKF32InstsReplicatingLower32BitsOfScalarInput()) return false; - switch (MI->getOpcode()) { - case AMDGPU::V_PK_ADD_F32: - case AMDGPU::V_PK_MUL_F32: - case AMDGPU::V_PK_FMA_F32: - return true; - default: - return false; - } - llvm_unreachable("unknown instruction"); + const MCOperandInfo &OpDesc = MI->getDesc().operands()[OpNo]; + return OpDesc.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP32; } // Packed FP32 instructions only read 32 bits from a scalar operand (SGPR or // literal) and replicates the bits to both channels. Therefore, if the hi and // lo are not same, we can't fold it. -static bool checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput( +static bool checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand( const FoldableDef &OpToFold) { assert(OpToFold.isImm() && "Expected immediate operand"); uint64_t ImmVal = OpToFold.getEffectiveImmVal().value(); @@ -953,8 +945,8 @@ bool SIFoldOperandsImpl::tryAddToFoldList( // Special case for PK_F32 instructions if we are trying to fold an imm to // src0 or src1. 
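
  // Only immediates whose high and low 32 bits are identical (e.g. 1.0f
  // replicated into both halves) can be folded, since only the low 32 bits
  // are read and replicated.
  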
if (OpToFold.isImm() && - isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, MI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, MI, OpNo) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, MI, OpNo, OpToFold); @@ -1171,8 +1163,8 @@ bool SIFoldOperandsImpl::tryToFoldACImm( return false; if (OpToFold.isImm() && OpToFold.isOperandLegal(*TII, *UseMI, UseOpIdx)) { - if (isPKF32InstrReplicatingLow32BitsOfScalarInput(ST, UseMI) && - !checkImmOpForPKF32InstrReplicatingLow32BitsOfScalarInput(OpToFold)) + if (isPKF32InstrReplicatesLower32BitsOfScalarOperand(ST, UseMI, UseOpIdx) && + !checkImmOpForPKF32InstrReplicatesLower32BitsOfScalarOperand(OpToFold)) return false; appendFoldCandidate(FoldList, UseMI, UseOpIdx, OpToFold); return true; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 24f58a68c345d..d12b802fe234f 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1631,20 +1631,22 @@ InstructionCost ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -ARMTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +ARMTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned IID = MICA.getID(); + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); if (ST->hasMVEIntegerOps()) { - if (Opcode == Instruction::Load && + if (IID == Intrinsic::masked_load && isLegalMaskedLoad(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); - if (Opcode == Instruction::Store && + if (IID == Intrinsic::masked_store && isLegalMaskedStore(Src, Alignment, AddressSpace)) return ST->getMVEVectorCostFactor(CostKind); } if (!isa(Src)) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); // Scalar cost, which is currently very high due to the efficiency of the // generated code. 
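
  // Assume on the order of eight instructions per scalarized element.
  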
return cast(Src)->getNumElements() * 8; diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 0810c5532ed91..919a6fc9fd0b0 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -275,8 +275,7 @@ class ARMTTIImpl final : public BasicTTIImplBase { const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getInterleavedMemoryOpCost( diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp index e925e041eb64e..8f3f0cc8abb01 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp @@ -224,11 +224,9 @@ InstructionCost HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, - Align Alignment, unsigned AddressSpace, +HexagonTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); } InstructionCost diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h index cec2bf9656ffc..e95b5a10b76a7 100644 --- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h +++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h @@ -120,8 +120,7 @@ class HexagonTTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *DstTy, VectorType *SrcTy, diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index bcdb46eca9744..cd7bc37942ca4 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -364,7 +364,42 @@ def INT_FENCE_SC_CLUSTER: NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>, Requires<[hasPTX<78>, hasSM<90>]>; +def INT_FENCE_MBARRIER_INIT_RELEASE_CLUSTER: + NullaryInst<"fence.mbarrier_init.release.cluster", + int_nvvm_fence_mbarrier_init_release_cluster>, + Requires<[hasPTX<80>, hasSM<90>]>; + +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_FENCE_ACQUIRE_SYNC_RESTRICT_CLUSTER_CLUSTER: + NullaryInst<"fence.acquire.sync_restrict::shared::cluster.cluster", + int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_FENCE_RELEASE_SYNC_RESTRICT_CTA_CLUSTER: + NullaryInst<"fence.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster>; +} + // Proxy fence (uni-directional) +let Predicates = [hasPTX<86>, hasSM<90>] in { +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_ACQUIRE_SYNC_RESTRICT_SPACE_CLUSTER_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster", + 
int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster>; + +def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_RELEASE_SYNC_RESTRICT_SPACE_CTA_SCOPE_CLUSTER: + NullaryInst<"fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster", + int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster>; +} + +// Proxy fence (bi-directional) +foreach proxykind = ["alias", "async", "async.global", "async.shared_cta", + "async.shared_cluster"] in { + defvar Preds = !if(!eq(proxykind, "alias"), [hasPTX<75>, hasSM<70>], + [hasPTX<80>, hasSM<90>]); + defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>; + def : NullaryInst<"fence.proxy." # !subst("_", "::", proxykind), + !cast(Intr.record_name)>, Requires; +} + class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE : NullaryInst<"fence.proxy.tensormap::generic.release." # Scope, Intr>, Requires<[hasPTX<83>, hasSM<90>]>; diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index dca6e9cffebb0..1a1a93a9cb178 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1008,13 +1008,17 @@ InstructionCost RISCVTTIImpl::getScalarizationOverhead( } InstructionCost -RISCVTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, +RISCVTTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? Instruction::Load + : Instruction::Store; + Type *Src = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + if (!isLegalMaskedLoadStore(Src, Alignment) || CostKind != TTI::TCK_RecipThroughput) - return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace, - CostKind); + return BaseT::getMaskedMemoryOpCost(MICA, CostKind); return getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, CostKind); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 6886e8964e29e..39c1173e2986c 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -144,8 +144,7 @@ class RISCVTTIImpl final : public BasicTTIImplBase { bool shouldConsiderVectorizationRegPressure() const override { return true; } InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 0b1430e373fc7..4b77bf925b2ba 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -5411,9 +5411,14 @@ InstructionCost X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, } InstructionCost -X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy, Align Alignment, - unsigned AddressSpace, +X86TTIImpl::getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const { + unsigned Opcode = MICA.getID() == Intrinsic::masked_load ? 
Instruction::Load + : Instruction::Store; + Type *SrcTy = MICA.getDataType(); + Align Alignment = MICA.getAlignment(); + unsigned AddressSpace = MICA.getAddressSpace(); + bool IsLoad = (Instruction::Load == Opcode); bool IsStore = (Instruction::Store == Opcode); @@ -6647,10 +6652,12 @@ InstructionCost X86TTIImpl::getInterleavedMemoryOpCostAVX512( LegalVT.getVectorNumElements()); InstructionCost MemOpCost; bool UseMaskedMemOp = UseMaskForCond || UseMaskForGaps; - if (UseMaskedMemOp) - MemOpCost = getMaskedMemoryOpCost(Opcode, SingleMemOpTy, Alignment, - AddressSpace, CostKind); - else + if (UseMaskedMemOp) { + unsigned IID = Opcode == Instruction::Load ? Intrinsic::masked_load + : Intrinsic::masked_store; + MemOpCost = getMaskedMemoryOpCost( + {IID, SingleMemOpTy, Alignment, AddressSpace}, CostKind); + } else MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace, CostKind); diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h index de5e1c297b1e4..df1393ce16ca1 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.h +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h @@ -183,8 +183,7 @@ class X86TTIImpl final : public BasicTTIImplBase { TTI::OperandValueInfo OpInfo = {TTI::OK_AnyValue, TTI::OP_None}, const Instruction *I = nullptr) const override; InstructionCost - getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment, - unsigned AddressSpace, + getMaskedMemoryOpCost(const MemIntrinsicCostAttributes &MICA, TTI::TargetCostKind CostKind) const override; InstructionCost getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask, diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 94663ff928a0b..fa35eef2c00f5 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1469,6 +1469,9 @@ void LowerTypeTestsModule::replaceWeakDeclarationWithJumpTablePtr( Constant::getNullValue(F->getType())); Value *Select = Builder.CreateSelect(ICmp, JT, Constant::getNullValue(F->getType())); + + if (auto *SI = dyn_cast(Select)) + setExplicitlyUnknownBranchWeightsIfProfiled(*SI, DEBUG_TYPE); // For phi nodes, we need to update the incoming value for all operands // with the same predecessor. if (PN) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index c680b6fca84cd..aa52f9e2a53ca 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5251,8 +5251,10 @@ LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; if (Legal->isMaskRequired(I)) { - Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, - CostKind); + unsigned IID = I->getOpcode() == Instruction::Load + ? 
Intrinsic::masked_load + : Intrinsic::masked_store; + Cost += TTI.getMaskedMemoryOpCost({IID, VectorTy, Alignment, AS}, CostKind); } else { TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(I->getOperand(0)); Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index deb8ee2d88055..e33ff724ccdd5 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -6904,9 +6904,10 @@ static bool isMaskedLoadCompress( ScalarLoadsCost; InstructionCost LoadCost = 0; if (IsMasked) { - LoadCost = - TTI.getMaskedMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, - LI->getPointerAddressSpace(), CostKind); + LoadCost = TTI.getMaskedMemoryOpCost({Intrinsic::masked_load, LoadVecTy, + CommonAlignment, + LI->getPointerAddressSpace()}, + CostKind); } else { LoadCost = TTI.getMemoryOpCost(Instruction::Load, LoadVecTy, CommonAlignment, @@ -7305,8 +7306,9 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( break; case LoadsState::CompressVectorize: VecLdCost += TTI.getMaskedMemoryOpCost( - Instruction::Load, SubVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind) + + {Intrinsic::masked_load, SubVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind) + VectorGEPCost + ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, SubVecTy, {}, CostKind); @@ -15102,8 +15104,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, CommonAlignment, LI0->getPointerAddressSpace(), CostKind); } else if (IsMasked) { VecLdCost = TTI->getMaskedMemoryOpCost( - Instruction::Load, LoadVecTy, CommonAlignment, - LI0->getPointerAddressSpace(), CostKind); + {Intrinsic::masked_load, LoadVecTy, CommonAlignment, + LI0->getPointerAddressSpace()}, + CostKind); // TODO: include this cost into CommonCost. VecLdCost += ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, LoadVecTy, CompressMask, CostKind); diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 94657f5d39390..e89e91b959926 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -3592,8 +3592,10 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, InstructionCost Cost = 0; if (IsMasked) { + unsigned IID = isa(this) ? Intrinsic::masked_load + : Intrinsic::masked_store; Cost += - Ctx.TTI.getMaskedMemoryOpCost(Opcode, Ty, Alignment, AS, Ctx.CostKind); + Ctx.TTI.getMaskedMemoryOpCost({IID, Ty, Alignment, AS}, Ctx.CostKind); } else { TTI::OperandValueInfo OpInfo = Ctx.getOperandInfo( isa(this) ? getOperand(0) @@ -3711,8 +3713,10 @@ InstructionCost VPWidenLoadEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_load. 
InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Load, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_load, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; @@ -3820,8 +3824,10 @@ InstructionCost VPWidenStoreEVLRecipe::computeCost(ElementCount VF, Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); + // FIXME: getMaskedMemoryOpCost assumes masked_* intrinsics. + // After migrating to getMemIntrinsicInstrCost, switch this to vp_store. InstructionCost Cost = Ctx.TTI.getMaskedMemoryOpCost( - Instruction::Store, Ty, Alignment, AS, Ctx.CostKind); + {Intrinsic::masked_store, Ty, Alignment, AS}, Ctx.CostKind); if (!Reverse) return Cost; diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll new file mode 100644 index 0000000000000..d46408e31752f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll new file mode 100644 index 0000000000000..896c624602a60 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll @@ -0,0 +1,51 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +define void @test_nvvm_fence_proxy_async() { +; CHECK-LABEL: test_nvvm_fence_proxy_async( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async() + ret void +} + +define void @test_nvvm_fence_proxy_async_global() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_global( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.global; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.global() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cluster() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: 
// %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cluster() + ret void +} + +define void @test_nvvm_fence_proxy_async_shared_cta() { +; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.async.shared::cta; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.async.shared_cta() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll new file mode 100644 index 0000000000000..ab35e4fb396d6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll @@ -0,0 +1,8 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s + +define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) { + ; CHECK: immarg value 130 out of range [128, 129) + call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll new file mode 100644 index 0000000000000..cb5679e68944d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll @@ -0,0 +1,15 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %} + +define void @test_nvvm_fence_proxy_alias() { +; CHECK-LABEL: test_nvvm_fence_proxy_alias( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.proxy.alias; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.proxy.alias() + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll new file mode 100644 index 0000000000000..629b702742afb --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/op-fence.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_mbarrier_init +define void @test_fence_mbarrier_init() { +; CHECK-LABEL: test_fence_mbarrier_init( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.mbarrier_init.release.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.mbarrier_init.release.cluster(); + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll new file mode 100644 index 0000000000000..185461bd183d0 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s +; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %} + +; CHECK-LABEL: test_fence_acquire +define void @test_fence_acquire() { +; CHECK-LABEL: test_fence_acquire( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; 
CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster(); + + ret void +} + +; CHECK-LABEL: test_fence_release +define void @test_fence_release() { +; CHECK-LABEL: test_fence_release( +; CHECK: { +; CHECK-EMPTY: +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster; +; CHECK-NEXT: ret; + call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster(); + + ret void +} diff --git a/llvm/test/TableGen/target-specialized-pseudos.td b/llvm/test/TableGen/target-specialized-pseudos.td new file mode 100644 index 0000000000000..99c63f3ec29d9 --- /dev/null +++ b/llvm/test/TableGen/target-specialized-pseudos.td @@ -0,0 +1,101 @@ +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DONECASE -o - | FileCheck -check-prefixes=CHECK,ONECASE %s +// RUN: llvm-tblgen -gen-instr-info -I %p/../../include %s -DALLCASES -o - | FileCheck -check-prefixes=CHECK,ALLCASES %s +// RUN: not llvm-tblgen -gen-instr-info -I %p/../../include %s -DERROR -o /dev/null 2>&1 | FileCheck -check-prefix=ERROR %s + +// CHECK: namespace llvm::MyTarget { +// CHECK: enum { +// CHECK: LOAD_STACK_GUARD = [[LOAD_STACK_GUARD_OPCODE:[0-9]+]], +// CHECK: PREALLOCATED_ARG = [[PREALLOCATED_ARG_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_EVENT_CALL = [[PATCHABLE_EVENT_CALL_OPCODE:[0-9]+]], +// CHECK: PATCHABLE_TYPED_EVENT_CALL = [[PATCHABLE_TYPED_EVENT_CALL_OPCODE:[0-9]+]], + +// Make sure no enum entry is emitted for MY_LOAD_STACK_GUARD +// CHECK: G_UBFX = [[G_UBFX_OPCODE:[0-9]+]], +// CHECK-NEXT: MY_MOV = [[MY_MOV_OPCODE:[0-9]+]], +// CHECK-NEXT: INSTRUCTION_LIST_END = [[INSTR_LIST_END_OPCODE:[0-9]+]] + + +// CHECK: extern const MyTargetInstrTable MyTargetDescs = { +// CHECK-NEXT: { +// CHECK-NEXT: { [[MY_MOV_OPCODE]], 2, 1, 2, 0, 0, 0, {{[0-9]+}}, MyTargetImpOpBase + 0, 0|(1ULL< + : Register { + let Namespace = "MyTarget"; +} + +class MyClass types, dag registers> + : RegisterClass<"MyTarget", types, size, registers> { + let Size = size; +} + +def X0 : MyReg<"x0">; +def X1 : MyReg<"x1">; +def XRegs : RegisterClass<"MyTarget", [i64], 64, (add X0, X1)>; + + +class TestInstruction : Instruction { + let Size = 2; + let Namespace = "MyTarget"; + let hasSideEffects = false; +} + +#ifdef ONECASE + +// Example setting the pointer register class manually +def MY_LOAD_STACK_GUARD : + TargetSpecializedStandardPseudoInstruction { + let Namespace = "MyTarget"; + let OutOperandList = (outs XRegs:$dst); +} + +#endif + +#ifdef ALLCASES + +defm my_remaps : RemapAllTargetPseudoPointerOperands; + +#endif + + +#ifdef ERROR + +def MY_LOAD_STACK_GUARD_0 : TargetSpecializedStandardPseudoInstruction; + +// ERROR: :[[@LINE+1]]:5: error: multiple overrides of 'LOAD_STACK_GUARD' defined +def MY_LOAD_STACK_GUARD_1 : TargetSpecializedStandardPseudoInstruction; + +#endif + +def MY_MOV : TestInstruction { + let OutOperandList = (outs XRegs:$dst); + let InOperandList = (ins XRegs:$src); + let AsmString = "my_mov $dst, $src"; +} + + +def MyTargetISA : InstrInfo; +def MyTarget : Target { let InstructionSet = MyTargetISA; } diff --git a/llvm/test/Transforms/LowerTypeTests/function-weak.ll b/llvm/test/Transforms/LowerTypeTests/function-weak.ll index 4ea03b6c2c1fa..dbbe8fa4a0a9a 100644 --- a/llvm/test/Transforms/LowerTypeTests/function-weak.ll +++ b/llvm/test/Transforms/LowerTypeTests/function-weak.ll @@ -32,10 +32,10 @@ target triple = "x86_64-unknown-linux-gnu" declare !type !0 extern_weak void @f() ; 
CHECK: define zeroext i1 @check_f() -define zeroext i1 @check_f() { +define zeroext i1 @check_f() !prof !{!"function_entry_count", i32 10} { entry: ; CHECK: [[CMP:%.*]] = icmp ne ptr @f, null -; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null +; CHECK: [[SEL:%.*]] = select i1 [[CMP]], ptr @[[JT:.*]], ptr null, !prof ![[SELPROF:[0-9]+]] ; CHECK: [[PTI:%.*]] = ptrtoint ptr [[SEL]] to i1 ; CHECK: ret i1 [[PTI]] ret i1 ptrtoint (ptr @f to i1) @@ -165,3 +165,4 @@ define i1 @foo(ptr %p) { ; CHECK-NEXT: } !0 = !{i32 0, !"typeid1"} +; CHECK: ![[SELPROF]] = !{!"unknown", !"lowertypetests"} \ No newline at end of file diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index e5025784d304d..35ec495b93ba2 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -80,6 +80,7 @@ #include "TableGenBackends.h" #include "llvm/ADT/SetVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/TableGen/CodeGenHelpers.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" @@ -549,9 +550,8 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { if (InstrMapVec.empty()) return; - OS << "#ifdef GET_INSTRMAP_INFO\n"; - OS << "#undef GET_INSTRMAP_INFO\n"; - OS << "namespace llvm::" << NameSpace << " {\n\n"; + IfDefEmitter IfDef(OS, "GET_INSTRMAP_INFO"); + NamespaceEmitter NS(OS, ("llvm::" + NameSpace).str()); // Emit coulumn field names and their values as enums. emitEnums(OS, Records); @@ -574,6 +574,4 @@ void llvm::EmitMapTable(const RecordKeeper &Records, raw_ostream &OS) { // Emit map tables and the functions to query them. IMap.emitTablesWithFunc(OS); } - OS << "} // end namespace llvm::" << NameSpace << '\n'; - OS << "#endif // GET_INSTRMAP_INFO\n\n"; } diff --git a/llvm/utils/TableGen/Common/CodeGenTarget.cpp b/llvm/utils/TableGen/Common/CodeGenTarget.cpp index c0daac127f71a..e080ca0aa0b31 100644 --- a/llvm/utils/TableGen/Common/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/Common/CodeGenTarget.cpp @@ -283,15 +283,25 @@ void CodeGenTarget::ComputeInstrsByEnum() const { assert(EndOfPredefines == getNumFixedInstructions() && "Missing generic opcode"); + [[maybe_unused]] unsigned SkippedInsts = 0; + for (const auto &[_, CGIUp] : InstMap) { const CodeGenInstruction *CGI = CGIUp.get(); if (CGI->Namespace != "TargetOpcode") { + + if (CGI->TheDef->isSubClassOf( + "TargetSpecializedStandardPseudoInstruction")) { + ++SkippedInsts; + continue; + } + InstrsByEnum.push_back(CGI); NumPseudoInstructions += CGI->TheDef->getValueAsBit("isPseudo"); } } - assert(InstrsByEnum.size() == InstMap.size() && "Missing predefined instr"); + assert(InstrsByEnum.size() + SkippedInsts == InstMap.size() && + "Missing predefined instr"); // All of the instructions are now in random order based on the map iteration. llvm::sort( diff --git a/llvm/utils/TableGen/InstrInfoEmitter.cpp b/llvm/utils/TableGen/InstrInfoEmitter.cpp index 32994c12aa98b..d46c9d811753a 100644 --- a/llvm/utils/TableGen/InstrInfoEmitter.cpp +++ b/llvm/utils/TableGen/InstrInfoEmitter.cpp @@ -72,6 +72,13 @@ class InstrInfoEmitter { using OperandInfoListTy = std::vector; using OperandInfoMapTy = std::map; + DenseMap + TargetSpecializedPseudoInsts; + + /// Compute mapping of opcodes which should have their definitions overridden + /// by a target version. + void buildTargetSpecializedPseudoInstsMap(); + /// Generate member functions in the target-specific GenInstrInfo class. 
   ///
   /// This method is used to custom expand TIIPredicate definitions.
@@ -216,6 +223,10 @@ InstrInfoEmitter::CollectOperandInfo(OperandInfoListTy &OperandInfoList,
   const CodeGenTarget &Target = CDP.getTargetInfo();
   unsigned Offset = 0;
   for (const CodeGenInstruction *Inst : Target.getInstructions()) {
+    auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst);
+    if (OverrideEntry != TargetSpecializedPseudoInsts.end())
+      Inst = OverrideEntry->second;
+
     OperandInfoTy OperandInfo = GetOperandInfo(*Inst);
     if (OperandInfoMap.try_emplace(OperandInfo, Offset).second) {
       OperandInfoList.push_back(OperandInfo);
@@ -859,6 +870,25 @@ void InstrInfoEmitter::emitTIIHelperMethods(raw_ostream &OS,
   }
 }
 
+void InstrInfoEmitter::buildTargetSpecializedPseudoInstsMap() {
+  ArrayRef<const Record *> SpecializedInsts = Records.getAllDerivedDefinitions(
+      "TargetSpecializedStandardPseudoInstruction");
+  const CodeGenTarget &Target = CDP.getTargetInfo();
+
+  for (const Record *SpecializedRec : SpecializedInsts) {
+    const CodeGenInstruction &SpecializedInst =
+        Target.getInstruction(SpecializedRec);
+    const Record *BaseInstRec = SpecializedRec->getValueAsDef("Instruction");
+
+    const CodeGenInstruction &BaseInst = Target.getInstruction(BaseInstRec);
+
+    if (!TargetSpecializedPseudoInsts.insert({&BaseInst, &SpecializedInst})
+             .second)
+      PrintFatalError(SpecializedRec, "multiple overrides of '" +
+                                          BaseInst.getName() + "' defined");
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Main Output.
 //===----------------------------------------------------------------------===//
@@ -881,6 +911,8 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
 
   // Collect all of the operand info records.
   Timer.startTimer("Collect operand info");
+  buildTargetSpecializedPseudoInstsMap();
+
   OperandInfoListTy OperandInfoList;
   OperandInfoMapTy OperandInfoMap;
   unsigned OperandInfoSize =
@@ -963,6 +995,11 @@ void InstrInfoEmitter::run(raw_ostream &OS) {
   for (const CodeGenInstruction *Inst : reverse(NumberedInstructions)) {
     // Keep a list of the instruction names.
     InstrNames.add(Inst->getName());
+
+    auto OverrideEntry = TargetSpecializedPseudoInsts.find(Inst);
+    if (OverrideEntry != TargetSpecializedPseudoInsts.end())
+      Inst = OverrideEntry->second;
+
     // Emit the record into the table.
     emitRecord(*Inst, --Num, InstrInfo, EmittedLists, OperandInfoMap, OS);
   }
diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
index 054c13a88a552..6b0c84d31d1ba 100644
--- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
+++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOpsInterfaces.td
@@ -44,4 +44,35 @@ def PartialEntityAccessOpInterface : OpInterface<"PartialEntityAccessOpInterface
   ];
 }
 
+def AddressOfGlobalOpInterface : OpInterface<"AddressOfGlobalOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for operations that compute the address of a global variable
+    or symbol.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Get the symbol reference to the global", "::mlir::SymbolRefAttr",
+      "getSymbol", (ins)>,
+  ];
+}
+
+def GlobalVariableOpInterface : OpInterface<"GlobalVariableOpInterface"> {
+  let cppNamespace = "::mlir::acc";
+
+  let description = [{
+    An interface for operations that define global variables. This interface
+    provides a uniform way to query properties of global variables across
+    different dialects.
+  }];
+
+  let methods = [
+    InterfaceMethod<"Check if the global variable is constant", "bool",
+      "isConstant", (ins), [{
+        return false;
+      }]>,
+  ];
+}
+
 #endif // OPENACC_OPS_INTERFACES
diff --git a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
index 970d9304d8289..cad78df2fbb0b 100644
--- a/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/OpenACC/Transforms/Passes.td
@@ -97,7 +97,14 @@ def ACCImplicitRoutine : Pass<"acc-implicit-routine", "mlir::ModuleOp"> {
            "mlir::acc::DeviceType::None",
            "Target device type for implicit routine generation. "
            "Ensures that `acc routine` device_type clauses are "
-           "properly considered not just default clauses.">
+           "properly considered not just default clauses.",
+           [{::llvm::cl::values(
+              clEnumValN(mlir::acc::DeviceType::None, "none", "none"),
+              clEnumValN(mlir::acc::DeviceType::Host, "host", "host"),
+              clEnumValN(mlir::acc::DeviceType::Multicore, "multicore", "multicore"),
+              clEnumValN(mlir::acc::DeviceType::Nvidia, "nvidia", "nvidia"),
+              clEnumValN(mlir::acc::DeviceType::Radeon, "radeon", "radeon"))
+           }]>
   ];
 }
 
diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 8c9c137b8aebb..5749e6ded73ba 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -211,6 +211,24 @@ struct LLVMPointerPointerLikeModel
   Type getElementType(Type pointer) const { return Type(); }
 };
 
+struct MemrefAddressOfGlobalModel
+    : public AddressOfGlobalOpInterface::ExternalModel<
+          MemrefAddressOfGlobalModel, memref::GetGlobalOp> {
+  SymbolRefAttr getSymbol(Operation *op) const {
+    auto getGlobalOp = cast<memref::GetGlobalOp>(op);
+    return getGlobalOp.getNameAttr();
+  }
+};
+
+struct MemrefGlobalVariableModel
+    : public GlobalVariableOpInterface::ExternalModel<
+          MemrefGlobalVariableModel, memref::GlobalOp> {
+  bool isConstant(Operation *op) const {
+    auto globalOp = cast<memref::GlobalOp>(op);
+    return globalOp.getConstant();
+  }
+};
+
 /// Helper function for any of the times we need to modify an ArrayAttr based on
 /// a device type list. Returns a new ArrayAttr with all of the
 /// existingDeviceTypes, plus the effective new ones(or an added none if hte new
@@ -302,6 +320,11 @@ void OpenACCDialect::initialize() {
       MemRefPointerLikeModel>(*getContext());
   LLVM::LLVMPointerType::attachInterface<LLVMPointerPointerLikeModel>(
       *getContext());
+
+  // Attach operation interfaces
+  memref::GetGlobalOp::attachInterface<MemrefAddressOfGlobalModel>(
+      *getContext());
+  memref::GlobalOp::attachInterface<MemrefGlobalVariableModel>(*getContext());
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
index 177c8680b0040..c8c2bb96b0539 100644
--- a/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
+++ b/mlir/unittests/Dialect/OpenACC/CMakeLists.txt
@@ -1,5 +1,6 @@
 add_mlir_unittest(MLIROpenACCTests
   OpenACCOpsTest.cpp
+  OpenACCOpsInterfacesTest.cpp
   OpenACCUtilsTest.cpp
 )
 mlir_target_link_libraries(MLIROpenACCTests
diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp
new file mode 100644
index 0000000000000..261f5c513ea24
--- /dev/null
+++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsInterfacesTest.cpp
@@ -0,0 +1,95 @@
+//===- OpenACCOpsInterfacesTest.cpp - Unit tests for OpenACC interfaces --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/OpenACC/OpenACC.h"
+#include "mlir/IR/Builders.h"
+#include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/MLIRContext.h"
+#include "mlir/IR/OwningOpRef.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::acc;
+
+//===----------------------------------------------------------------------===//
+// Test Fixture
+//===----------------------------------------------------------------------===//
+
+class OpenACCOpsInterfacesTest : public ::testing::Test {
+protected:
+  OpenACCOpsInterfacesTest()
+      : context(), builder(&context), loc(UnknownLoc::get(&context)) {
+    context.loadDialect<acc::OpenACCDialect, memref::MemRefDialect>();
+  }
+
+  MLIRContext context;
+  OpBuilder builder;
+  Location loc;
+};
+
+//===----------------------------------------------------------------------===//
+// GlobalVariableOpInterface Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceNonConstant) {
+  // Test that a non-constant global returns false for isConstant()
+
+  auto memrefType = MemRefType::get({10}, builder.getF32Type());
+  OwningOpRef<memref::GlobalOp> globalOp = memref::GlobalOp::create(
+      builder, loc,
+      /*sym_name=*/builder.getStringAttr("mutable_global"),
+      /*sym_visibility=*/builder.getStringAttr("private"),
+      /*type=*/TypeAttr::get(memrefType),
+      /*initial_value=*/Attribute(),
+      /*constant=*/UnitAttr(),
+      /*alignment=*/IntegerAttr());
+
+  auto globalVarIface =
+      dyn_cast<GlobalVariableOpInterface>(globalOp->getOperation());
+  ASSERT_TRUE(globalVarIface != nullptr);
+  EXPECT_FALSE(globalVarIface.isConstant());
+}
+
+TEST_F(OpenACCOpsInterfacesTest, GlobalVariableOpInterfaceConstant) {
+  // Test that a constant global returns true for isConstant()
+
+  auto memrefType = MemRefType::get({5}, builder.getI32Type());
+  OwningOpRef<memref::GlobalOp> constantGlobalOp = memref::GlobalOp::create(
+      builder, loc,
+      /*sym_name=*/builder.getStringAttr("constant_global"),
+      /*sym_visibility=*/builder.getStringAttr("public"),
+      /*type=*/TypeAttr::get(memrefType),
+      /*initial_value=*/Attribute(),
+      /*constant=*/builder.getUnitAttr(),
+      /*alignment=*/IntegerAttr());
+
+  auto globalVarIface =
+      dyn_cast<GlobalVariableOpInterface>(constantGlobalOp->getOperation());
+  ASSERT_TRUE(globalVarIface != nullptr);
+  EXPECT_TRUE(globalVarIface.isConstant());
+}
+
+//===----------------------------------------------------------------------===//
+// AddressOfGlobalOpInterface Tests
+//===----------------------------------------------------------------------===//
+
+TEST_F(OpenACCOpsInterfacesTest, AddressOfGlobalOpInterfaceGetSymbol) {
+  // Test that getSymbol() returns the correct symbol reference
+
+  auto memrefType = MemRefType::get({5}, builder.getI32Type());
+  const auto *symbolName = "test_global_symbol";
+
+  OwningOpRef<memref::GetGlobalOp> getGlobalOp = memref::GetGlobalOp::create(
+      builder, loc, memrefType, FlatSymbolRefAttr::get(&context, symbolName));
+
+  auto addrOfGlobalIface =
+      dyn_cast<AddressOfGlobalOpInterface>(getGlobalOp->getOperation());
+  ASSERT_TRUE(addrOfGlobalIface != nullptr);
+  EXPECT_EQ(addrOfGlobalIface.getSymbol().getLeafReference(), symbolName);
+}
diff --git a/orc-rt/lib/executor/TaskDispatcher.cpp b/orc-rt/lib/executor/TaskDispatcher.cpp
index 5f34627fb5150..9e42a66c2ea94 100644
--- a/orc-rt/lib/executor/TaskDispatcher.cpp
+++ b/orc-rt/lib/executor/TaskDispatcher.cpp
@@ -1,4 +1,4 @@
-//===- TaskDispatch.cpp ---------------------------------------------------===//
+//===- TaskDispatcher.cpp -------------------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Contains the implementation of APIs in the orc-rt/TaskDispatch.h header.
+// Contains the implementation of APIs in the orc-rt/TaskDispatcher.h header.
 //
 //===----------------------------------------------------------------------===//
 
diff --git a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
index d6d301302220d..4bf7e5df69654 100644
--- a/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
+++ b/orc-rt/lib/executor/ThreadPoolTaskDispatcher.cpp
@@ -1,4 +1,4 @@
-//===- ThreadPoolTaskDispatch.cpp -----------------------------------------===//
+//===- ThreadPoolTaskDispatcher.cpp ---------------------------------------===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -6,7 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatch.h
+// Contains the implementation of APIs in the orc-rt/ThreadPoolTaskDispatcher.h
 // header.
 //
 //===----------------------------------------------------------------------===//
diff --git a/revert_patches.txt b/revert_patches.txt
index 9e465ba90ae6a..a3a76b6ac1e40 100644
--- a/revert_patches.txt
+++ b/revert_patches.txt
@@ -5,3 +5,6 @@ d57230c7 [AMDGPU][MC] Disallow op_sel in some VOP3P dot instructions (#100485)
 breaks build of ROCmValidationSuite
 [C2y] Support WG14 N3457, the __COUNTER__ macro (#162662)
 ---
+Shore will help land downstream
+[AMDGPU] Adding instruction specific features (#167809)
+---
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index b027d82d98177..643f8ab03f724 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -2874,6 +2874,10 @@ llvm_target_lib_list = [lib for lib in [
             ["-gen-subtarget"],
             "lib/Target/NVPTX/NVPTXGenSubtargetInfo.inc",
         ),
+        (
+            ["-gen-sd-node-info"],
+            "lib/Target/NVPTX/NVPTXGenSDNodeInfo.inc",
+        ),
     ],
 },
 {