From bba5951b6f9cd77047cafc554b20144b33602298 Mon Sep 17 00:00:00 2001
From: Shivam Gupta <shivam98.tkg@gmail.com>
Date: Tue, 18 Jun 2024 01:09:30 +0530
Subject: [PATCH 01/25] [MLIR] Fix an assert that contains a mistake in
 conditional operator (#95668)

This is described in (N2) of https://pvs-studio.com/en/blog/posts/cpp/1126/
and was caught by the PVS-Studio analyzer.

Warning message -
V502 Perhaps the '?:' operator works in a different way than it was
expected. The '?:' operator has a lower priority than the '+' operator.
LoopEmitter.cpp 983
V502 Perhaps the '?:' operator works in a different way than it was
expected. The '?:' operator has a lower priority than the '+' operator.
LoopEmitter.cpp 1039

The assert should be
assert(bArgs.size() == reduc.size() + (needsUniv ? 1 : 0));
since '+' has higher precedence than '?:', so without the parentheses the
whole comparison becomes the condition of the conditional operator.

This can be further reduced to
assert(aArgs.size() == reduc.size() + needsUniv);
because needsUniv is a bool value which is implicitly converted to 0 or 1.
---
 .../lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
index 05883f1cefdf3..fe0e515a2d180 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/LoopEmitter.cpp
@@ -542,7 +542,7 @@ std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
   }
   // The remaining block arguments are user-provided reduction values and an
   // optional universal index. Make sure their sizes match.
-  assert(bArgs.size() == reduc.size() + needsUniv ? 1 : 0);
+  assert(bArgs.size() == reduc.size() + needsUniv);
   builder.create<scf::ConditionOp>(loc, whileCond, before->getArguments());
 
   // Generates loop body.
@@ -560,7 +560,7 @@ std::pair<Operation *, Value> LoopEmitter::emitWhileLoopOverTensorsAtLvls(
   }
 
   // In-place update on reduction variable.
-  assert(aArgs.size() == reduc.size() + needsUniv ? 1 : 0);
+  assert(aArgs.size() == reduc.size() + needsUniv);
   for (unsigned i = 0, e = reduc.size(); i < e; i++)
     reduc[i] = aArgs[i];
 

From 3b997294d6117241477ab36be0595040f8278707 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 17 Jun 2024 21:44:52 +0200
Subject: [PATCH 02/25] AMDGPU: Remove .v2bf16 buffer atomic fadd intrinsics
 (#95783)

These are redundant with the unsuffixed versions, and have a name
collision with surprising behavior when the base intrinsic is used with
v2bf16.

The global and flat variants should be removed too, but those are complicated
due to using v2i16 in place of the natural v2bf16. Those cases can soon be
completely deleted in favor of atomicrmw.

The GlobalISel codegen change is broken and substitutes handling as bf16
for handling as f16, but it's a bug that this passed the IRTranslator in the first
place.
---
 llvm/include/llvm/IR/IntrinsicsAMDGPU.td      | 44 +------------------
 llvm/lib/Target/AMDGPU/AMDGPUGISel.td         |  1 -
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp |  1 -
 llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h   |  1 -
 .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp |  9 ----
 .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp  |  2 -
 .../Target/AMDGPU/AMDGPUSearchableTables.td   |  4 --
 llvm/lib/Target/AMDGPU/BUFInstructions.td     |  2 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |  9 ----
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |  1 -
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  1 -
 .../test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll | 12 ++---
 ...mdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll | 10 +++--
 13 files changed, 15 insertions(+), 82 deletions(-)

diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e6b69b39911a9..45f1092094572 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -1337,27 +1337,9 @@ def int_amdgcn_raw_ptr_buffer_atomic_cmpswap : Intrinsic<
 
 // gfx908 intrinsic
 def int_amdgcn_raw_buffer_atomic_fadd : AMDGPURawBufferAtomic<llvm_anyfloat_ty>;
+
+// Supports float and <2 x half> on gfx908. Supports v2bf16 on gfx90a, gfx940, gfx12+.
 def int_amdgcn_raw_ptr_buffer_atomic_fadd : AMDGPURawPtrBufferAtomic<llvm_anyfloat_ty>;
-// gfx12+ intrinsic
-def int_amdgcn_raw_buffer_atomic_fadd_v2bf16 : Intrinsic <
-  [llvm_v2bf16_ty],
-  [llvm_v2bf16_ty,
-   llvm_v4i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty],
- [ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
-def int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
-  [llvm_v2bf16_ty],
-  [llvm_v2bf16_ty,
-   AMDGPUBufferRsrcTy,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty],
- [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
-  ImmArg<ArgIndex<4>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
- AMDGPURsrcIntrinsic<1, 0>;
 
 class AMDGPUStructBufferAtomic<LLVMType data_ty = llvm_any_ty> : Intrinsic <
   [data_ty],
@@ -1434,28 +1416,6 @@ def int_amdgcn_struct_ptr_buffer_atomic_cmpswap : Intrinsic<
 // gfx908 intrinsic
 def int_amdgcn_struct_buffer_atomic_fadd : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
 def int_amdgcn_struct_ptr_buffer_atomic_fadd : AMDGPUStructPtrBufferAtomic<llvm_anyfloat_ty>;
-// gfx12 intrinsic
-def int_amdgcn_struct_buffer_atomic_fadd_v2bf16 : Intrinsic <
-  [llvm_v2bf16_ty],
-  [llvm_v2bf16_ty,
-   llvm_v4i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty],
-  [ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
-  AMDGPURsrcIntrinsic<1, 0>;
-def int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16 : Intrinsic <
-  [llvm_v2bf16_ty],
-  [llvm_v2bf16_ty,
-   AMDGPUBufferRsrcTy,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty,
-   llvm_i32_ty],
-  [IntrArgMemOnly, NoCapture<ArgIndex<1>>,
-   ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>,
-  AMDGPURsrcIntrinsic<1, 0>;
 
 // gfx90a intrinsics
 def int_amdgcn_struct_buffer_atomic_fmin : AMDGPUStructBufferAtomic<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index 231db188e65dc..d81c18875eebd 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -290,7 +290,6 @@ def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD, SIbuffer_atomic_fadd>;
-def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FADD_BF16, SIbuffer_atomic_fadd_bf16>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMIN, SIbuffer_atomic_fmin>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_FMAX, SIbuffer_atomic_fmax>;
 def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 18193d8807597..519e623306eb1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -5564,7 +5564,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
   NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
   NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
-  NODE_NAME_CASE(BUFFER_ATOMIC_FADD_BF16)
   NODE_NAME_CASE(BUFFER_ATOMIC_FMIN)
   NODE_NAME_CASE(BUFFER_ATOMIC_FMAX)
   NODE_NAME_CASE(BUFFER_ATOMIC_COND_SUB_U32)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index 71c4334029b43..206bb46b6c863 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -615,7 +615,6 @@ enum NodeType : unsigned {
   BUFFER_ATOMIC_CMPSWAP,
   BUFFER_ATOMIC_CSUB,
   BUFFER_ATOMIC_FADD,
-  BUFFER_ATOMIC_FADD_BF16,
   BUFFER_ATOMIC_FMIN,
   BUFFER_ATOMIC_FMAX,
   BUFFER_ATOMIC_COND_SUB_U32,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 973b6b8cce177..0c7b1968e551c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -6018,11 +6018,6 @@ static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
     return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD;
-  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
-    return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16;
   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
   case Intrinsic::amdgcn_struct_buffer_atomic_fmin:
@@ -7330,10 +7325,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
-  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
     return legalizeBufferAtomic(MI, B, IntrID);
   case Intrinsic::amdgcn_rsq_clamp:
     return legalizeRsqClampIntrinsic(MI, MRI, B);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 7ebd674757fbc..313d53a1524d2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -3079,7 +3079,6 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
     return;
   }
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
-  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
     applyDefaultMapping(OpdMapper);
@@ -4376,7 +4375,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD:
-  case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FADD_BF16:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMIN:
   case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_FMAX: {
     // vdata_out
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index e84d39a2895c8..7b29d573b6101 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -269,7 +269,6 @@ def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_raw_buffer_atomic_cmpswap>;
@@ -287,7 +286,6 @@ def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_raw_ptr_buffer_atomic_cmpswap>;
@@ -305,7 +303,6 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
@@ -323,7 +320,6 @@ def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_xor>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_inc>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_dec>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd>;
-def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmin>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_fmax>;
 def : SourceOfDivergence<int_amdgcn_struct_ptr_buffer_atomic_cmpswap>;
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 43e5434ea2700..f5b6de15e19e7 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1751,7 +1751,7 @@ let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
 defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>;
 
 let SubtargetPredicate = isGFX12Plus in {
-  defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd_bf16", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
+  defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_fadd", v2bf16, "BUFFER_ATOMIC_PK_ADD_BF16_VBUFFER">;
   defm : SIBufferAtomicPat_Common<"SIbuffer_atomic_cond_sub_u32", i32, "BUFFER_ATOMIC_COND_SUB_U32_VBUFFER", ["ret"]>;
 
   let OtherPredicates = [HasAtomicCSubNoRtnInsts] in
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index d9a163ded6bab..c436e03806dc8 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -8833,17 +8833,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_raw_buffer_atomic_fadd:
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd:
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
-  case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_raw_buffer_atomic_fadd_v2bf16:
-    return lowerRawBufferAtomicIntrin(Op, DAG,
-                                      AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
   case Intrinsic::amdgcn_struct_buffer_atomic_fadd:
   case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd:
     return lowerStructBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FADD);
-  case Intrinsic::amdgcn_struct_buffer_atomic_fadd_v2bf16:
-  case Intrinsic::amdgcn_struct_ptr_buffer_atomic_fadd_v2bf16:
-    return lowerStructBufferAtomicIntrin(Op, DAG,
-                                         AMDGPUISD::BUFFER_ATOMIC_FADD_BF16);
   case Intrinsic::amdgcn_raw_buffer_atomic_fmin:
   case Intrinsic::amdgcn_raw_ptr_buffer_atomic_fmin:
     return lowerRawBufferAtomicIntrin(Op, DAG, AMDGPUISD::BUFFER_ATOMIC_FMIN);
@@ -15841,7 +15833,6 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode *N,
   case AMDGPUISD::BUFFER_ATOMIC_CMPSWAP:
   case AMDGPUISD::BUFFER_ATOMIC_CSUB:
   case AMDGPUISD::BUFFER_ATOMIC_FADD:
-  case AMDGPUISD::BUFFER_ATOMIC_FADD_BF16:
   case AMDGPUISD::BUFFER_ATOMIC_FMIN:
   case AMDGPUISD::BUFFER_ATOMIC_FMAX:
     // Target-specific read-modify-write atomics are sources of divergence.
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 6682763210411..9b9ff4a5d6996 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -222,7 +222,6 @@ defm SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
 defm SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
 defm SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
 defm SIbuffer_atomic_fadd : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD">;
-defm SIbuffer_atomic_fadd_bf16 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FADD_BF16">;
 defm SIbuffer_atomic_fmin : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMIN">;
 defm SIbuffer_atomic_fmax : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_FMAX">;
 defm SIbuffer_atomic_cond_sub_u32 : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_COND_SUB_U32">;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index ba31027da92e8..e32bb8fec1f54 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -3892,7 +3892,6 @@ def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_FADD : BufferAtomicGenericInstruction;
-def G_AMDGPU_BUFFER_ATOMIC_FADD_BF16 : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_FMIN : BufferAtomicGenericInstruction;
 def G_AMDGPU_BUFFER_ATOMIC_FMAX : BufferAtomicGenericInstruction;
 
diff --git a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
index 2f29a1a9aa768..9f339af0f5580 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-atomics-gfx1200.ll
@@ -321,7 +321,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret_offset(<2 x half> %val,
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret_offset:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -339,7 +339,7 @@ define amdgpu_ps void @raw_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4 x i
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
@@ -356,7 +356,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret_offset(<2 x half> %
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret_offset:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, off, s[0:3], s4 offset:92 th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 92, i32 %soffset, i32 0)
@@ -372,7 +372,7 @@ define amdgpu_ps <2 x half> @raw_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4
 ;
 ; GFX12-GISEL-LABEL: raw_buffer_atomic_add_v2f16_ret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v1, s[0:3], s4 offen th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %ret = call <2 x half> @llvm.amdgcn.raw.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %voffset, i32 %soffset, i32 0)
@@ -388,7 +388,7 @@ define amdgpu_ps float @struct_buffer_atomic_add_v2f16_ret(<2 x half> %val, <4 x
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_ret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen th:TH_ATOMIC_RETURN
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %orig = call <2 x half> @llvm.amdgcn.struct.buffer.atomic.fadd.v2f16(<2 x half> %val, <4 x i32> %rsrc, i32 %vindex, i32 %voffset, i32 %soffset, i32 0)
@@ -406,7 +406,7 @@ define amdgpu_ps void @struct_buffer_atomic_add_v2f16_noret(<2 x half> %val, <4
 ;
 ; GFX12-GISEL-LABEL: struct_buffer_atomic_add_v2f16_noret:
 ; GFX12-GISEL:       ; %bb.0:
-; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_f16 v0, v[1:2], s[0:3], s4 idxen offen
+; GFX12-GISEL-NEXT:    buffer_atomic_pk_add_bf16 v0, v[1:2], s[0:3], s4 idxen offen
 ; GFX12-GISEL-NEXT:    s_nop 0
 ; GFX12-GISEL-NEXT:    s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
 ; GFX12-GISEL-NEXT:    s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
index a3b83c346c1bb..f8caf84d5c51a 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ptr.buffer.atomic.fadd_rtn_errors.ll
@@ -17,10 +17,12 @@
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2f16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2F16-GISEL %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
 ; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
+
+; FIXME: These should fail when bfloat support is handled correctly
+; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
+; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
+; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/raw-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-RAW-V2BF16-GISEL %s
+; xUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=null %t/struct-ret-v2bf16-error.ll 2>&1 | FileCheck -check-prefix=ERR-STRUCT-V2BF16-GISEL %s
 
 ; Make sure buffer fadd atomics with return values are not selected
 ; for gfx908 where they do not work.

From 8930ac1bbe0bc50402da53b22501e17045a537ca Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault@amd.com>
Date: Mon, 17 Jun 2024 21:51:25 +0200
Subject: [PATCH 03/25] AMDGPU: Cleanup selection patterns for buffer loads
 (#95378)

We should just support these for all register types.
---
 llvm/lib/Target/AMDGPU/BUFInstructions.td | 72 ++++++++++-------------
 llvm/lib/Target/AMDGPU/SIRegisterInfo.td  | 16 ++---
 2 files changed, 39 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index f5b6de15e19e7..dff19b6a93286 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1419,27 +1419,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i32, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2bf16, "BUFFER_LOAD_DWORD">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i32, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, i64, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, f64, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4bf16, "BUFFER_LOAD_DWORDX2">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3f32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v3i32, "BUFFER_LOAD_DWORDX3">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4f32, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v4i32, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2i64, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v2f64, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8i16, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8f16, "BUFFER_LOAD_DWORDX4">;
-defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, v8bf16, "BUFFER_LOAD_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_LoadIntrinsicPat<SIbuffer_load, vt, "BUFFER_LOAD_DWORDX4">;
+}
 
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_byte, i32, "BUFFER_LOAD_SBYTE">;
 defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_short, i32, "BUFFER_LOAD_SSHORT">;
@@ -1530,27 +1524,21 @@ let OtherPredicates = [HasPackedD16VMem] in {
   defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i16, "BUFFER_STORE_FORMAT_D16_XYZW">;
 } // End HasPackedD16VMem.
 
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i32, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2bf16, "BUFFER_STORE_DWORD">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i32, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, i64, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, f64, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4bf16, "BUFFER_STORE_DWORDX2">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3f32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v3i32, "BUFFER_STORE_DWORDX3">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4f32, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v4i32, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2i64, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v2f64, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8f16, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8i16, "BUFFER_STORE_DWORDX4">;
-defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, v8bf16, "BUFFER_STORE_DWORDX4">;
+foreach vt = Reg32Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORD">;
+}
+
+foreach vt = Reg64Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX2">;
+}
+
+foreach vt = Reg96Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX3">;
+}
+
+foreach vt = Reg128Types.types in {
+defm : MUBUF_StoreIntrinsicPat<SIbuffer_store, vt, "BUFFER_STORE_DWORDX4">;
+}
 
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_byte, i32, "BUFFER_STORE_BYTE">;
 defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 3666976cf82f8..a8efe2b2ba35e 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -586,7 +586,9 @@ class RegisterTypes<list<ValueType> reg_types> {
 
 def Reg16Types : RegisterTypes<[i16, f16, bf16]>;
 def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, v2bf16, p2, p3, p5, p6]>;
-def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, v4i16, v4f16, v4bf16, p0]>;
+def Reg64Types : RegisterTypes<[i64, f64, v2i32, v2f32, p0, v4i16, v4f16, v4bf16]>;
+def Reg96Types : RegisterTypes<[v3i32, v3f32]>;
+def Reg128Types : RegisterTypes<[v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16]>;
 
 let HasVGPR = 1 in {
 // VOP3 and VINTERP can access 256 lo and 256 hi registers.
@@ -744,7 +746,7 @@ def Pseudo_SReg_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16,
   let BaseClassOrder = 10000;
 }
 
-def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", [v4i32, v2i64, v2f64, v8i16, v8f16, v8bf16], 32,
+def Pseudo_SReg_128 : SIRegisterClass<"AMDGPU", Reg128Types.types, 32,
   (add PRIVATE_RSRC_REG)> {
   let isAllocatable = 0;
   let CopyCost = -1;
@@ -815,7 +817,7 @@ def SRegOrLds_32 : SIRegisterClass<"AMDGPU", [i32, f32, i16, f16, bf16, v2i16, v
   let HasSGPR = 1;
 }
 
-def SGPR_64 : SIRegisterClass<"AMDGPU", [v2i32, i64, v2f32, f64, v4i16, v4f16, v4bf16], 32,
+def SGPR_64 : SIRegisterClass<"AMDGPU", Reg64Types.types, 32,
                             (add SGPR_64Regs)> {
   let CopyCost = 1;
   let AllocationPriority = 1;
@@ -905,8 +907,8 @@ multiclass SRegClass<int numRegs,
   }
 }
 
-defm "" : SRegClass<3, [v3i32, v3f32], SGPR_96Regs, TTMP_96Regs>;
-defm "" : SRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], SGPR_128Regs, TTMP_128Regs>;
+defm "" : SRegClass<3, Reg96Types.types, SGPR_96Regs, TTMP_96Regs>;
+defm "" : SRegClass<4, Reg128Types.types, SGPR_128Regs, TTMP_128Regs>;
 defm "" : SRegClass<5, [v5i32, v5f32], SGPR_160Regs, TTMP_160Regs>;
 defm "" : SRegClass<6, [v6i32, v6f32, v3i64, v3f64], SGPR_192Regs, TTMP_192Regs>;
 defm "" : SRegClass<7, [v7i32, v7f32], SGPR_224Regs, TTMP_224Regs>;
@@ -958,8 +960,8 @@ multiclass VRegClass<int numRegs, list<ValueType> regTypes, dag regList> {
 
 defm VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4bf16, v4i16, p0, p1, p4],
                                 (add VGPR_64)>;
-defm VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
-defm VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, v8i16, v8f16, v8bf16], (add VGPR_128)>;
+defm VReg_96 : VRegClass<3, Reg96Types.types, (add VGPR_96)>;
+defm VReg_128 : VRegClass<4, Reg128Types.types, (add VGPR_128)>;
 defm VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
 
 defm VReg_192 : VRegClass<6, [v6i32, v6f32, v3i64, v3f64], (add VGPR_192)>;

From 804335638078f5a7c2bd31847b6080763cb22159 Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Mon, 17 Jun 2024 14:52:50 -0500
Subject: [PATCH 04/25] [Offload] Change HSA header search order (#95769)

Summary:
The HSA headers existed previously in `include/hsa.h` and were moved to
`include/hsa/hsa.h` in a later ROCm version. The include headers here
were originally designed to favor the newer one. However, this
unintentionally prevented the dynamic HSA's `hsa.h` from being used if
both were present. This patch changes the order so that `hsa.h` is found
first.

Related to https://github.com/llvm/llvm-project/pull/95484.
---
 offload/plugins-nextgen/amdgpu/src/rtl.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index e678213df18ce..e6643d3260eb4 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -58,12 +58,12 @@
 #endif
 
 #if defined(__has_include)
-#if __has_include("hsa/hsa.h")
-#include "hsa/hsa.h"
-#include "hsa/hsa_ext_amd.h"
-#elif __has_include("hsa.h")
+#if __has_include("hsa.h")
 #include "hsa.h"
 #include "hsa_ext_amd.h"
+#elif __has_include("hsa/hsa.h")
+#include "hsa/hsa.h"
+#include "hsa/hsa_ext_amd.h"
 #endif
 #else
 #include "hsa/hsa.h"

From 77d8cfb3c50e3341d65af1f9e442004bbd77af9b Mon Sep 17 00:00:00 2001
From: Alexander Shaposhnikov
 <6532716+alexander-shaposhnikov@users.noreply.github.com>
Date: Mon, 17 Jun 2024 12:59:04 -0700
Subject: [PATCH 05/25] [Flang] Switch to common::visit more call sites
 (#90018)

Switch more call sites from std::visit to common::visit.

Test plan: ninja check-all
---
 flang/include/flang/Evaluate/tools.h          |   4 +-
 flang/include/flang/Lower/DumpEvaluateExpr.h  |   2 +-
 flang/include/flang/Lower/PFTBuilder.h        |   7 +-
 flang/include/flang/Lower/Support/Utils.h     |  40 +++--
 .../include/flang/Optimizer/Support/Matcher.h |   5 +-
 flang/lib/Evaluate/intrinsics.cpp             |   2 +-
 flang/lib/Lower/Allocatable.cpp               |  23 +--
 flang/lib/Lower/Bridge.cpp                    |  98 +++++------
 flang/lib/Lower/CallInterface.cpp             |   6 +-
 flang/lib/Lower/ComponentPath.cpp             |   2 +-
 flang/lib/Lower/ConvertArrayConstructor.cpp   |  37 +++--
 flang/lib/Lower/ConvertCall.cpp               |   3 +-
 flang/lib/Lower/ConvertConstant.cpp           |   8 +-
 flang/lib/Lower/ConvertExpr.cpp               | 155 ++++++++++--------
 flang/lib/Lower/ConvertExprToHLFIR.cpp        |  23 +--
 flang/lib/Lower/ConvertType.cpp               |   2 +-
 flang/lib/Lower/DirectivesCommon.h            |   8 +-
 flang/lib/Lower/IO.cpp                        |  16 +-
 flang/lib/Lower/IterationSpace.cpp            |  39 ++---
 flang/lib/Lower/Mangler.cpp                   |   2 +-
 flang/lib/Lower/OpenACC.cpp                   |  41 ++---
 flang/lib/Lower/OpenMP/Clauses.cpp            |  24 +--
 flang/lib/Lower/OpenMP/OpenMP.cpp             |  27 +--
 flang/lib/Lower/OpenMP/Utils.cpp              |   2 +-
 flang/lib/Lower/PFTBuilder.cpp                |  51 +++---
 flang/lib/Lower/VectorSubscripts.cpp          |  95 +++++------
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp |   6 +-
 flang/lib/Semantics/check-acc-structure.cpp   |  14 +-
 flang/lib/Semantics/check-coarray.cpp         |   2 +-
 29 files changed, 391 insertions(+), 353 deletions(-)

diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h
index 378a5fca03264..ea56a20633f0c 100644
--- a/flang/include/flang/Evaluate/tools.h
+++ b/flang/include/flang/Evaluate/tools.h
@@ -450,12 +450,12 @@ struct ExtractSubstringHelper {
 
   template <typename T>
   static std::optional<Substring> visit(const Designator<T> &e) {
-    return std::visit([](auto &&s) { return visit(s); }, e.u);
+    return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 
   template <typename T>
   static std::optional<Substring> visit(const Expr<T> &e) {
-    return std::visit([](auto &&s) { return visit(s); }, e.u);
+    return common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 };
 
diff --git a/flang/include/flang/Lower/DumpEvaluateExpr.h b/flang/include/flang/Lower/DumpEvaluateExpr.h
index c67df245359e3..88f53e96a81c2 100644
--- a/flang/include/flang/Lower/DumpEvaluateExpr.h
+++ b/flang/include/flang/Lower/DumpEvaluateExpr.h
@@ -68,7 +68,7 @@ class DumpEvaluateExpr {
   }
   template <typename... A>
   void show(const std::variant<A...> &u) {
-    std::visit([&](const auto &v) { show(v); }, u);
+    Fortran::common::visit([&](const auto &v) { show(v); }, u);
   }
   template <typename A>
   void show(const std::vector<A> &x) {
diff --git a/flang/include/flang/Lower/PFTBuilder.h b/flang/include/flang/Lower/PFTBuilder.h
index c2b600c6b5d9b..7f1b93c564b4c 100644
--- a/flang/include/flang/Lower/PFTBuilder.h
+++ b/flang/include/flang/Lower/PFTBuilder.h
@@ -76,7 +76,7 @@ class ReferenceVariantBase {
   }
   template <typename VISITOR>
   constexpr auto visit(VISITOR &&visitor) const {
-    return std::visit(
+    return Fortran::common::visit(
         common::visitors{[&visitor](auto ref) { return visitor(ref.get()); }},
         u);
   }
@@ -494,7 +494,8 @@ struct Variable {
 
   /// Is this variable a global?
   bool isGlobal() const {
-    return std::visit([](const auto &x) { return x.isGlobal(); }, var);
+    return Fortran::common::visit([](const auto &x) { return x.isGlobal(); },
+                                  var);
   }
 
   /// Is this a module or submodule variable?
@@ -504,7 +505,7 @@ struct Variable {
   }
 
   const Fortran::semantics::Scope *getOwningScope() const {
-    return std::visit(
+    return Fortran::common::visit(
         common::visitors{
             [](const Nominal &x) { return &x.symbol->GetUltimate().owner(); },
             [](const AggregateStore &agg) { return &agg.getOwningScope(); }},
diff --git a/flang/include/flang/Lower/Support/Utils.h b/flang/include/flang/Lower/Support/Utils.h
index e791f3dbb221a..1cc74521e22d8 100644
--- a/flang/include/flang/Lower/Support/Utils.h
+++ b/flang/include/flang/Lower/Support/Utils.h
@@ -69,7 +69,8 @@ static Fortran::lower::SomeExpr ignoreEvConvert(const A &x) {
 inline Fortran::lower::SomeExpr
 ignoreEvConvert(const Fortran::evaluate::Expr<Fortran::evaluate::Type<
                     Fortran::common::TypeCategory::Integer, 8>> &x) {
-  return std::visit([](const auto &v) { return ignoreEvConvert(v); }, x.u);
+  return Fortran::common::visit(
+      [](const auto &v) { return ignoreEvConvert(v); }, x.u);
 }
 
 /// Zip two containers of the same size together and flatten the pairs. `flatZip
@@ -119,7 +120,8 @@ class HashEvaluateExpr {
     return 0u;
   }
   static unsigned getHashValue(const Fortran::evaluate::Subscript &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   static unsigned getHashValue(const Fortran::evaluate::Triplet &x) {
     return getHashValue(x.lower()) - getHashValue(x.upper()) * 5u -
@@ -154,7 +156,8 @@ class HashEvaluateExpr {
     return getHashValue(x.GetComponent()) * 13u;
   }
   static unsigned getHashValue(const Fortran::evaluate::DataRef &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   static unsigned getHashValue(const Fortran::evaluate::ComplexPart &x) {
     return getHashValue(x.complex()) - static_cast<unsigned>(x.part());
@@ -247,8 +250,9 @@ class HashEvaluateExpr {
     return getHashValue(sym.get());
   }
   static unsigned getHashValue(const Fortran::evaluate::Substring &x) {
-    return 61u * std::visit([&](const auto &p) { return getHashValue(p); },
-                            x.parent()) -
+    return 61u *
+               Fortran::common::visit(
+                   [&](const auto &p) { return getHashValue(p); }, x.parent()) -
            getHashValue(x.lower()) - (getHashValue(x.lower()) + 1u);
   }
   static unsigned
@@ -270,7 +274,8 @@ class HashEvaluateExpr {
   }
   static unsigned
   getHashValue(const Fortran::evaluate::ProcedureDesignator &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   static unsigned getHashValue(const Fortran::evaluate::ProcedureRef &x) {
     unsigned args = 13u;
@@ -321,15 +326,18 @@ class HashEvaluateExpr {
   }
   template <typename A>
   static unsigned getHashValue(const Fortran::evaluate::Expr<A> &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   static unsigned getHashValue(
       const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   template <typename A>
   static unsigned getHashValue(const Fortran::evaluate::Designator<A> &x) {
-    return std::visit([&](const auto &v) { return getHashValue(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return getHashValue(v); }, x.u);
   }
   template <int BITS>
   static unsigned
@@ -378,7 +386,7 @@ class IsEqualEvaluateExpr {
   }
   static bool isEqual(const Fortran::evaluate::Subscript &x,
                       const Fortran::evaluate::Subscript &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   static bool isEqual(const Fortran::evaluate::Triplet &x,
@@ -411,7 +419,7 @@ class IsEqualEvaluateExpr {
   }
   static bool isEqual(const Fortran::evaluate::DataRef &x,
                       const Fortran::evaluate::DataRef &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   static bool isEqual(const Fortran::evaluate::ComplexPart &x,
@@ -499,7 +507,7 @@ class IsEqualEvaluateExpr {
   }
   static bool isEqual(const Fortran::evaluate::Substring &x,
                       const Fortran::evaluate::Substring &y) {
-    return std::visit(
+    return Fortran::common::visit(
                [&](const auto &p, const auto &q) { return isEqual(p, q); },
                x.parent(), y.parent()) &&
            isEqual(x.lower(), y.lower()) && isEqual(x.upper(), y.upper());
@@ -529,7 +537,7 @@ class IsEqualEvaluateExpr {
   }
   static bool isEqual(const Fortran::evaluate::ProcedureDesignator &x,
                       const Fortran::evaluate::ProcedureDesignator &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   static bool isEqual(const Fortran::evaluate::ProcedureRef &x,
@@ -591,19 +599,19 @@ class IsEqualEvaluateExpr {
   template <typename A>
   static bool isEqual(const Fortran::evaluate::Expr<A> &x,
                       const Fortran::evaluate::Expr<A> &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   static bool
   isEqual(const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &x,
           const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   template <typename A>
   static bool isEqual(const Fortran::evaluate::Designator<A> &x,
                       const Fortran::evaluate::Designator<A> &y) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &v, const auto &w) { return isEqual(v, w); }, x.u, y.u);
   }
   template <int BITS>
diff --git a/flang/include/flang/Optimizer/Support/Matcher.h b/flang/include/flang/Optimizer/Support/Matcher.h
index da1d7c21f42c4..44672d3c0a072 100644
--- a/flang/include/flang/Optimizer/Support/Matcher.h
+++ b/flang/include/flang/Optimizer/Support/Matcher.h
@@ -13,6 +13,7 @@
 #ifndef FORTRAN_OPTIMIZER_SUPPORT_MATCHER_H
 #define FORTRAN_OPTIMIZER_SUPPORT_MATCHER_H
 
+#include "flang/Common/idioms.h"
 #include <variant>
 
 // Boilerplate CRTP class for a simplified type-casing syntactic sugar. This
@@ -23,10 +24,10 @@ template<class... Ts> struct matches : Ts... { using Ts::operator()...; };
 template<class... Ts> matches(Ts...) -> matches<Ts...>;
 template<typename N> struct matcher {
   template<typename... Ts> auto match(Ts... ts) {
-    return std::visit(matches{ts...}, static_cast<N*>(this)->matchee());
+    return Fortran::common::visit(matches{ts...}, static_cast<N*>(this)->matchee());
   }
   template<typename... Ts> auto match(Ts... ts) const {
-    return std::visit(matches{ts...}, static_cast<N const*>(this)->matchee());
+    return Fortran::common::visit(matches{ts...}, static_cast<N const*>(this)->matchee());
   }
 };
 // clang-format on
diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp
index ace316174a892..1bba541e8e14a 100644
--- a/flang/lib/Evaluate/intrinsics.cpp
+++ b/flang/lib/Evaluate/intrinsics.cpp
@@ -2936,7 +2936,7 @@ static bool CheckForNonPositiveValues(FoldingContext &context,
   if (arg.Rank() > 0) {
     if (const Expr<SomeType> *expr{arg.UnwrapExpr()}) {
       if (const auto *intExpr{std::get_if<Expr<SomeInteger>>(&expr->u)}) {
-        std::visit(
+        Fortran::common::visit(
             [&](const auto &kindExpr) {
               using IntType = typename std::decay_t<decltype(kindExpr)>::Result;
               if (const auto *constArray{
diff --git a/flang/lib/Lower/Allocatable.cpp b/flang/lib/Lower/Allocatable.cpp
index 068f5d25967c9..77e02898ac9fb 100644
--- a/flang/lib/Lower/Allocatable.cpp
+++ b/flang/lib/Lower/Allocatable.cpp
@@ -350,10 +350,10 @@ class AllocateStmtHelper {
   void visitAllocateOptions() {
     for (const auto &allocOption :
          std::get<std::list<Fortran::parser::AllocOpt>>(stmt.t))
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::parser::StatOrErrmsg &statOrErr) {
-                std::visit(
+                Fortran::common::visit(
                     Fortran::common::visitors{
                         [&](const Fortran::parser::StatVariable &statVar) {
                           statExpr = Fortran::semantics::GetExpr(statVar);
@@ -898,15 +898,16 @@ void Fortran::lower::genDeallocateStmt(
   const Fortran::lower::SomeExpr *errMsgExpr = nullptr;
   for (const Fortran::parser::StatOrErrmsg &statOrErr :
        std::get<std::list<Fortran::parser::StatOrErrmsg>>(stmt.t))
-    std::visit(Fortran::common::visitors{
-                   [&](const Fortran::parser::StatVariable &statVar) {
-                     statExpr = Fortran::semantics::GetExpr(statVar);
-                   },
-                   [&](const Fortran::parser::MsgVariable &errMsgVar) {
-                     errMsgExpr = Fortran::semantics::GetExpr(errMsgVar);
-                   },
-               },
-               statOrErr.u);
+    Fortran::common::visit(
+        Fortran::common::visitors{
+            [&](const Fortran::parser::StatVariable &statVar) {
+              statExpr = Fortran::semantics::GetExpr(statVar);
+            },
+            [&](const Fortran::parser::MsgVariable &errMsgVar) {
+              errMsgExpr = Fortran::semantics::GetExpr(errMsgVar);
+            },
+        },
+        statOrErr.u);
   ErrorManager errorManager;
   errorManager.init(converter, loc, statExpr, errMsgExpr);
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 24a57812ba104..423c418889a00 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -302,7 +302,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     bool hasMainProgram = false;
     const Fortran::semantics::Symbol *globalOmpRequiresSymbol = nullptr;
     for (Fortran::lower::pft::Program::Units &u : pft.getUnits()) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](Fortran::lower::pft::FunctionLikeUnit &f) {
                 if (f.isMainProgram())
@@ -336,7 +336,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     // Primary translation pass.
     for (Fortran::lower::pft::Program::Units &u : pft.getUnits()) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](Fortran::lower::pft::FunctionLikeUnit &f) { lowerFunc(f); },
               [&](Fortran::lower::pft::ModuleLikeUnit &m) { lowerMod(m); },
@@ -2062,7 +2062,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
           handleLocalitySpecs(info);
 
         for (const auto *dir : dirs) {
-          std::visit(
+          Fortran::common::visit(
               Fortran::common::visitors{
                   [&](const Fortran::parser::CompilerDirective::VectorAlways
                           &d) { addLoopAnnotationAttr(info); },
@@ -2433,7 +2433,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   }
 
   void genFIR(const Fortran::parser::ForallAssignmentStmt &stmt) {
-    std::visit([&](const auto &x) { genFIR(x); }, stmt.u);
+    Fortran::common::visit([&](const auto &x) { genFIR(x); }, stmt.u);
   }
 
   void genFIR(const Fortran::parser::EndForallStmt &) {
@@ -2494,7 +2494,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
             forall.t));
     for (const Fortran::parser::ForallBodyConstruct &s :
          std::get<std::list<Fortran::parser::ForallBodyConstruct>>(forall.t)) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::parser::WhereConstruct &b) { genFIR(b); },
               [&](const Fortran::common::Indirection<
@@ -2617,7 +2617,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   void genFIR(const Fortran::parser::CompilerDirective &dir) {
     Fortran::lower::pft::Evaluation &eval = getEval();
 
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::parser::CompilerDirective::VectorAlways &) {
               attachDirectiveToLoop(dir, &eval);
@@ -3198,7 +3198,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
         const auto &rank = std::get<Fortran::parser::SelectRankCaseStmt::Rank>(
             rankCaseStmt->t);
         assert(e->block && "missing SelectRankCaseStmt block");
-        std::visit(
+        Fortran::common::visit(
             Fortran::common::visitors{
                 [&](const Fortran::parser::ScalarIntConstantExpr &rankExpr) {
                   blockList.emplace_back(e->block);
@@ -3229,9 +3229,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
            "selector should not yet be set");
     Fortran::lower::StatementContext &stmtCtx =
         activeConstructStack.back().stmtCtx;
-    const Fortran::lower::SomeExpr *selectorExpr =
-        std::visit([](const auto &x) { return Fortran::semantics::GetExpr(x); },
-                   std::get<Fortran::parser::Selector>(selectRankStmt.t).u);
+    const Fortran::lower::SomeExpr *selectorExpr = Fortran::common::visit(
+        [](const auto &x) { return Fortran::semantics::GetExpr(x); },
+        std::get<Fortran::parser::Selector>(selectRankStmt.t).u);
     assert(selectorExpr && "failed to retrieve selector expr");
     hlfir::Entity selector = Fortran::lower::convertExprToHLFIR(
         loc, *this, *selectorExpr, localSymbols, stmtCtx);
@@ -3663,7 +3663,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     Fortran::parser::Label errLabel{};
     bool hasIostat{};
     for (const auto &spec : specList) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::parser::EndLabel &label) {
                 endLabel = label.v;
@@ -4373,7 +4373,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   void genAssignment(const Fortran::evaluate::Assignment &assign) {
     mlir::Location loc = toLocation();
     if (lowerToHighLevelFIR()) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::evaluate::Assignment::Intrinsic &) {
                 genDataAssignment(assign, /*userDefinedAssignment=*/nullptr);
@@ -4401,7 +4401,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       explicitIterSpace.genLoopNest();
     }
     Fortran::lower::StatementContext stmtCtx;
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             // [1] Plain old assignment.
             [&](const Fortran::evaluate::Assignment::Intrinsic &) {
@@ -4670,7 +4670,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     }
   }
   void genFIR(const Fortran::parser::WhereBodyConstruct &body) {
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::parser::Statement<
                 Fortran::parser::AssignmentStmt> &stmt) {
@@ -5386,18 +5386,19 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     // The intrinsic module scope, if present, is the first scope.
     const Fortran::semantics::Scope *intrinsicModuleScope = nullptr;
     for (Fortran::lower::pft::Program::Units &u : pft.getUnits()) {
-      std::visit(Fortran::common::visitors{
-                     [&](Fortran::lower::pft::FunctionLikeUnit &f) {
-                       intrinsicModuleScope = &f.getScope().parent();
-                     },
-                     [&](Fortran::lower::pft::ModuleLikeUnit &m) {
-                       intrinsicModuleScope = &m.getScope().parent();
-                     },
-                     [&](Fortran::lower::pft::BlockDataUnit &b) {},
-                     [&](Fortran::lower::pft::CompilerDirectiveUnit &d) {},
-                     [&](Fortran::lower::pft::OpenACCDirectiveUnit &d) {},
-                 },
-                 u);
+      Fortran::common::visit(
+          Fortran::common::visitors{
+              [&](Fortran::lower::pft::FunctionLikeUnit &f) {
+                intrinsicModuleScope = &f.getScope().parent();
+              },
+              [&](Fortran::lower::pft::ModuleLikeUnit &m) {
+                intrinsicModuleScope = &m.getScope().parent();
+              },
+              [&](Fortran::lower::pft::BlockDataUnit &b) {},
+              [&](Fortran::lower::pft::CompilerDirectiveUnit &d) {},
+              [&](Fortran::lower::pft::OpenACCDirectiveUnit &d) {},
+          },
+          u);
       if (intrinsicModuleScope) {
         while (!intrinsicModuleScope->IsGlobal())
           intrinsicModuleScope = &intrinsicModuleScope->parent();
@@ -5531,7 +5532,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       analyzeExplicitSpace</*LHS=*/true>(lhs);
       analyzeExplicitSpace(rhs);
     };
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::ProcedureRef &procRef) {
               // Ensure the procRef expressions are the one being visited.
@@ -5549,7 +5550,8 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     explicitIterSpace.endAssign();
   }
   void analyzeExplicitSpace(const Fortran::parser::ForallAssignmentStmt &stmt) {
-    std::visit([&](const auto &s) { analyzeExplicitSpace(s); }, stmt.u);
+    Fortran::common::visit([&](const auto &s) { analyzeExplicitSpace(s); },
+                           stmt.u);
   }
   void analyzeExplicitSpace(const Fortran::parser::AssignmentStmt &s) {
     analyzeExplicitSpace(s.typedAssignment->v.operator->());
@@ -5594,13 +5596,14 @@ class FirConverter : public Fortran::lower::AbstractConverter {
       analyzeExplicitSpace(e);
   }
   void analyzeExplicitSpace(const Fortran::parser::WhereBodyConstruct &body) {
-    std::visit(Fortran::common::visitors{
-                   [&](const Fortran::common::Indirection<
-                       Fortran::parser::WhereConstruct> &wc) {
-                     analyzeExplicitSpace(wc.value());
-                   },
-                   [&](const auto &s) { analyzeExplicitSpace(s.statement); }},
-               body.u);
+    Fortran::common::visit(
+        Fortran::common::visitors{
+            [&](const Fortran::common::Indirection<
+                Fortran::parser::WhereConstruct> &wc) {
+              analyzeExplicitSpace(wc.value());
+            },
+            [&](const auto &s) { analyzeExplicitSpace(s.statement); }},
+        body.u);
   }
   void analyzeExplicitSpace(const Fortran::parser::MaskedElsewhereStmt &stmt) {
     const Fortran::lower::SomeExpr *exp = Fortran::semantics::GetExpr(
@@ -5651,16 +5654,17 @@ class FirConverter : public Fortran::lower::AbstractConverter {
             .statement);
     for (const Fortran::parser::ForallBodyConstruct &s :
          std::get<std::list<Fortran::parser::ForallBodyConstruct>>(forall.t)) {
-      std::visit(Fortran::common::visitors{
-                     [&](const Fortran::common::Indirection<
-                         Fortran::parser::ForallConstruct> &b) {
-                       analyzeExplicitSpace(b.value());
-                     },
-                     [&](const Fortran::parser::WhereConstruct &w) {
-                       analyzeExplicitSpace(w);
-                     },
-                     [&](const auto &b) { analyzeExplicitSpace(b.statement); }},
-                 s.u);
+      Fortran::common::visit(
+          Fortran::common::visitors{
+              [&](const Fortran::common::Indirection<
+                  Fortran::parser::ForallConstruct> &b) {
+                analyzeExplicitSpace(b.value());
+              },
+              [&](const Fortran::parser::WhereConstruct &w) {
+                analyzeExplicitSpace(w);
+              },
+              [&](const auto &b) { analyzeExplicitSpace(b.statement); }},
+          s.u);
     }
     analyzeExplicitSpacePop();
   }
@@ -5715,7 +5719,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
   std::string getConstantExprManglePrefix(mlir::Location loc,
                                           const Fortran::lower::SomeExpr &expr,
                                           mlir::Type eleTy) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &x) -> std::string {
           using T = std::decay_t<decltype(x)>;
           if constexpr (Fortran::common::HasMember<
@@ -5730,7 +5734,7 @@ class FirConverter : public Fortran::lower::AbstractConverter {
               fir::emitFatalError(loc,
                                   "non a constant derived type expression");
             } else {
-              return std::visit(
+              return Fortran::common::visit(
                   [&](const auto &someKind) -> std::string {
                     using T = std::decay_t<decltype(someKind)>;
                     using TK = Fortran::evaluate::Type<T::Result::category,
diff --git a/flang/lib/Lower/CallInterface.cpp b/flang/lib/Lower/CallInterface.cpp
index 5f1d69c1de7ac..75a86273250d1 100644
--- a/flang/lib/Lower/CallInterface.cpp
+++ b/flang/lib/Lower/CallInterface.cpp
@@ -187,7 +187,7 @@ asImplicitArg(Fortran::evaluate::characteristics::DummyDataObject &&dummy) {
 
 static Fortran::evaluate::characteristics::DummyArgument
 asImplicitArg(Fortran::evaluate::characteristics::DummyArgument &&dummy) {
-  return std::visit(
+  return Fortran::common::visit(
       Fortran::common::visitors{
           [&](Fortran::evaluate::characteristics::DummyDataObject &obj) {
             return Fortran::evaluate::characteristics::DummyArgument(
@@ -843,7 +843,7 @@ class Fortran::lower::CallInterfaceImpl {
     for (auto pair : llvm::zip(procedure.dummyArguments, argumentEntities)) {
       const Fortran::evaluate::characteristics::DummyArgument
           &argCharacteristics = std::get<0>(pair);
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const auto &dummy) {
                 const auto &entity = getDataObjectEntity(std::get<1>(pair));
@@ -877,7 +877,7 @@ class Fortran::lower::CallInterfaceImpl {
     for (auto pair : llvm::zip(procedure.dummyArguments, argumentEntities)) {
       const Fortran::evaluate::characteristics::DummyArgument
           &argCharacteristics = std::get<0>(pair);
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::evaluate::characteristics::DummyDataObject
                       &dummy) {
diff --git a/flang/lib/Lower/ComponentPath.cpp b/flang/lib/Lower/ComponentPath.cpp
index d20ea23153102..5bdbca6062e6d 100644
--- a/flang/lib/Lower/ComponentPath.cpp
+++ b/flang/lib/Lower/ComponentPath.cpp
@@ -36,7 +36,7 @@ void Fortran::lower::ComponentPath::clear() {
 
 bool Fortran::lower::isRankedArrayAccess(const Fortran::evaluate::ArrayRef &x) {
   for (const Fortran::evaluate::Subscript &sub : x.subscript()) {
-    if (std::visit(
+    if (Fortran::common::visit(
             Fortran::common::visitors{
                 [&](const Fortran::evaluate::Triplet &) { return true; },
                 [&](const Fortran::evaluate::IndirectSubscriptIntegerExpr &e) {
diff --git a/flang/lib/Lower/ConvertArrayConstructor.cpp b/flang/lib/Lower/ConvertArrayConstructor.cpp
index 341fad9a5e43c..3c43cd20eb080 100644
--- a/flang/lib/Lower/ConvertArrayConstructor.cpp
+++ b/flang/lib/Lower/ConvertArrayConstructor.cpp
@@ -438,7 +438,7 @@ class ArrayCtorLoweringStrategy {
 
   void pushValue(mlir::Location loc, fir::FirOpBuilder &builder,
                  hlfir::Entity value) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](auto &impl) { return impl.pushValue(loc, builder, value); },
         implVariant);
   }
@@ -446,7 +446,7 @@ class ArrayCtorLoweringStrategy {
   mlir::Value startImpliedDo(mlir::Location loc, fir::FirOpBuilder &builder,
                              mlir::Value lower, mlir::Value upper,
                              mlir::Value stride) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](auto &impl) {
           return impl.startImpliedDo(loc, builder, lower, upper, stride);
         },
@@ -455,13 +455,13 @@ class ArrayCtorLoweringStrategy {
 
   hlfir::Entity finishArrayCtorLowering(mlir::Location loc,
                                         fir::FirOpBuilder &builder) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](auto &impl) { return impl.finishArrayCtorLowering(loc, builder); },
         implVariant);
   }
 
   void startImpliedDoScope(llvm::StringRef doName, mlir::Value indexValue) {
-    std::visit(
+    Fortran::common::visit(
         [&](auto &impl) {
           return impl.startImpliedDoScope(doName, indexValue);
         },
@@ -469,8 +469,8 @@ class ArrayCtorLoweringStrategy {
   }
 
   void endImpliedDoScope() {
-    std::visit([&](auto &impl) { return impl.endImpliedDoScope(); },
-               implVariant);
+    Fortran::common::visit([&](auto &impl) { return impl.endImpliedDoScope(); },
+                           implVariant);
   }
 
 private:
@@ -612,16 +612,17 @@ ArrayCtorAnalysis::ArrayCtorAnalysis(
         arrayValueListStack.pop_back_val();
     for (const Fortran::evaluate::ArrayConstructorValue<T> &acValue :
          *currentArrayValueList)
-      std::visit(Fortran::common::visitors{
-                     [&](const Fortran::evaluate::ImpliedDo<T> &impledDo) {
-                       arrayValueListStack.push_back(&impledDo.values());
-                       localNumberOfImpliedDo++;
-                     },
-                     [&](const Fortran::evaluate::Expr<T> &expr) {
-                       localNumberOfExpr++;
-                       anyArrayExpr = anyArrayExpr || expr.Rank() > 0;
-                     }},
-                 acValue.u);
+      Fortran::common::visit(
+          Fortran::common::visitors{
+              [&](const Fortran::evaluate::ImpliedDo<T> &impledDo) {
+                arrayValueListStack.push_back(&impledDo.values());
+                localNumberOfImpliedDo++;
+              },
+              [&](const Fortran::evaluate::Expr<T> &expr) {
+                localNumberOfExpr++;
+                anyArrayExpr = anyArrayExpr || expr.Rank() > 0;
+              }},
+          acValue.u);
     anyImpliedDo = anyImpliedDo || localNumberOfImpliedDo > 0;
 
     if (localNumberOfImpliedDo == 0) {
@@ -765,7 +766,7 @@ static void genAcValue(mlir::Location loc,
                                    impliedDoIndexValue);
 
   for (const auto &acValue : impledDo.values())
-    std::visit(
+    Fortran::common::visit(
         [&](const auto &x) {
           genAcValue(loc, converter, x, symMap, stmtCtx, arrayBuilder);
         },
@@ -787,7 +788,7 @@ hlfir::EntityWithAttributes Fortran::lower::ArrayConstructorBuilder<T>::gen(
       loc, converter, arrayCtorExpr, symMap, stmtCtx);
   // Run the array lowering strategy through the ac-values.
   for (const auto &acValue : arrayCtorExpr)
-    std::visit(
+    Fortran::common::visit(
         [&](const auto &x) {
           genAcValue(loc, converter, x, symMap, stmtCtx, arrayBuilder);
         },
diff --git a/flang/lib/Lower/ConvertCall.cpp b/flang/lib/Lower/ConvertCall.cpp
index b1dc41f3ca838..65a2ffbea5dd1 100644
--- a/flang/lib/Lower/ConvertCall.cpp
+++ b/flang/lib/Lower/ConvertCall.cpp
@@ -935,7 +935,8 @@ struct CallCleanUp {
     mlir::Value mustFree;
   };
   void genCleanUp(mlir::Location loc, fir::FirOpBuilder &builder) {
-    std::visit([&](auto &c) { c.genCleanUp(loc, builder); }, cleanUp);
+    Fortran::common::visit([&](auto &c) { c.genCleanUp(loc, builder); },
+                           cleanUp);
   }
   std::variant<CopyIn, ExprAssociate> cleanUp;
 };
diff --git a/flang/lib/Lower/ConvertConstant.cpp b/flang/lib/Lower/ConvertConstant.cpp
index a4ace40a3a1c4..3361817ee27ee 100644
--- a/flang/lib/Lower/ConvertConstant.cpp
+++ b/flang/lib/Lower/ConvertConstant.cpp
@@ -105,7 +105,7 @@ class DenseGlobalBuilder {
                                    const Fortran::lower::SomeExpr &initExpr,
                                    cuf::DataAttributeAttr dataAttr) {
     DenseGlobalBuilder globalBuilder;
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::Expr<Fortran::evaluate::SomeLogical> &
                     x) { globalBuilder.tryConvertingToAttributes(builder, x); },
@@ -164,7 +164,7 @@ class DenseGlobalBuilder {
   template <typename SomeCat>
   void tryConvertingToAttributes(fir::FirOpBuilder &builder,
                                  const Fortran::evaluate::Expr<SomeCat> &expr) {
-    std::visit(
+    Fortran::common::visit(
         [&](const auto &x) {
           using TR = Fortran::evaluate::ResultType<decltype(x)>;
           if (const auto *constant =
@@ -796,7 +796,7 @@ static fir::ExtendedValue
 genConstantValue(Fortran::lower::AbstractConverter &converter,
                  mlir::Location loc,
                  const Fortran::lower::SomeExpr &constantExpr) {
-  return std::visit(
+  return Fortran::common::visit(
       [&](const auto &x) -> fir::ExtendedValue {
         using T = std::decay_t<decltype(x)>;
         if constexpr (Fortran::common::HasMember<
@@ -805,7 +805,7 @@ genConstantValue(Fortran::lower::AbstractConverter &converter,
                         Fortran::common::TypeCategory::Derived) {
             return genConstantValue(converter, loc, x);
           } else {
-            return std::visit(
+            return Fortran::common::visit(
                 [&](const auto &preciseKind) {
                   return genConstantValue(converter, loc, preciseKind);
                 },
diff --git a/flang/lib/Lower/ConvertExpr.cpp b/flang/lib/Lower/ConvertExpr.cpp
index 9567685aa3d2e..9937e9d159886 100644
--- a/flang/lib/Lower/ConvertExpr.cpp
+++ b/flang/lib/Lower/ConvertExpr.cpp
@@ -398,8 +398,8 @@ static bool isParenthesizedVariable(const Fortran::evaluate::Expr<T> &expr) {
       return Fortran::evaluate::IsVariable(parentheses->left());
     return false;
   } else {
-    return std::visit([&](const auto &x) { return isParenthesizedVariable(x); },
-                      expr.u);
+    return Fortran::common::visit(
+        [&](const auto &x) { return isParenthesizedVariable(x); }, expr.u);
   }
 }
 
@@ -646,7 +646,7 @@ isOptimizableTranspose(Fortran::evaluate::Expr<T> expr,
   if (!isTransposeOptEnabled(converter))
     return false;
 
-  return std::visit(
+  return Fortran::common::visit(
       [&](const auto &e) { return isOptimizableTranspose(e, converter); },
       expr.u);
 }
@@ -696,7 +696,7 @@ class ScalarExprLowering {
     //    - result of NULL() or NULL(MOLD) intrinsic.
     //    NULL() requires some context to be lowered, so it is not handled
     //    here and must be lowered according to the context where it appears.
-    ExtValue exv = std::visit(
+    ExtValue exv = Fortran::common::visit(
         [&](const auto &x) { return genMutableBoxValueImpl(x); }, expr.u);
     const fir::MutableBoxValue *mutableBox =
         exv.getBoxOf<fir::MutableBoxValue>();
@@ -737,7 +737,7 @@ class ScalarExprLowering {
   template <typename T>
   ExtValue
   genMutableBoxValueImpl(const Fortran::evaluate::Designator<T> &designator) {
-    return std::visit(
+    return Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::SymbolRef &sym) -> ExtValue {
               return converter.getSymbolExtendedValue(*sym, &symMap);
@@ -754,8 +754,8 @@ class ScalarExprLowering {
 
   template <typename T>
   ExtValue genMutableBoxValueImpl(const Fortran::evaluate::Expr<T> &expr) {
-    return std::visit([&](const auto &x) { return genMutableBoxValueImpl(x); },
-                      expr.u);
+    return Fortran::common::visit(
+        [&](const auto &x) { return genMutableBoxValueImpl(x); }, expr.u);
   }
 
   mlir::Location getLoc() { return location; }
@@ -1222,7 +1222,8 @@ class ScalarExprLowering {
 
   ExtValue
   genval(const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &op) {
-    return std::visit([&](const auto &x) { return genval(x); }, op.u);
+    return Fortran::common::visit([&](const auto &x) { return genval(x); },
+                                  op.u);
   }
 
   template <Fortran::common::TypeCategory TC1, int KIND,
@@ -1341,7 +1342,7 @@ class ScalarExprLowering {
   /// Reference to a substring.
   ExtValue gen(const Fortran::evaluate::Substring &s) {
     // Get base string
-    auto baseString = std::visit(
+    auto baseString = Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::DataRef &x) { return gen(x); },
             [&](const Fortran::evaluate::StaticDataObject::Pointer &p)
@@ -1400,10 +1401,12 @@ class ScalarExprLowering {
   }
 
   ExtValue gen(const Fortran::evaluate::DataRef &dref) {
-    return std::visit([&](const auto &x) { return gen(x); }, dref.u);
+    return Fortran::common::visit([&](const auto &x) { return gen(x); },
+                                  dref.u);
   }
   ExtValue genval(const Fortran::evaluate::DataRef &dref) {
-    return std::visit([&](const auto &x) { return genval(x); }, dref.u);
+    return Fortran::common::visit([&](const auto &x) { return genval(x); },
+                                  dref.u);
   }
 
   // Helper function to turn the Component structure into a list of nested
@@ -1418,7 +1421,7 @@ class ScalarExprLowering {
                     std::list<const Fortran::evaluate::Component *> &list) {
     if (!getLastSym(cmpt).test(Fortran::semantics::Symbol::Flag::ParentComp))
       list.push_front(&cmpt);
-    return std::visit(
+    return Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::Component &x) {
               if (Fortran::semantics::IsAllocatableOrPointer(getLastSym(x)))
@@ -1713,11 +1716,12 @@ class ScalarExprLowering {
 
   template <typename A>
   ExtValue gen(const Fortran::evaluate::Designator<A> &des) {
-    return std::visit([&](const auto &x) { return gen(x); }, des.u);
+    return Fortran::common::visit([&](const auto &x) { return gen(x); }, des.u);
   }
   template <typename A>
   ExtValue genval(const Fortran::evaluate::Designator<A> &des) {
-    return std::visit([&](const auto &x) { return genval(x); }, des.u);
+    return Fortran::common::visit([&](const auto &x) { return genval(x); },
+                                  des.u);
   }
 
   mlir::Type genType(const Fortran::evaluate::DynamicType &dt) {
@@ -2900,8 +2904,8 @@ class ScalarExprLowering {
   }
   template <typename T>
   bool isTransformationalRef(Fortran::evaluate::Expr<T> expr) {
-    return std::visit([&](const auto &e) { return isTransformationalRef(e); },
-                      expr.u);
+    return Fortran::common::visit(
+        [&](const auto &e) { return isTransformationalRef(e); }, expr.u);
   }
 
   template <typename A>
@@ -2914,11 +2918,13 @@ class ScalarExprLowering {
   /// value, so it may be possible to avoid making a temporary.
   template <typename A>
   ExtValue asArrayArg(const Fortran::evaluate::Expr<A> &x) {
-    return std::visit([&](const auto &e) { return asArrayArg(e, x); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &e) { return asArrayArg(e, x); }, x.u);
   }
   template <typename A, typename B>
   ExtValue asArrayArg(const Fortran::evaluate::Expr<A> &x, const B &y) {
-    return std::visit([&](const auto &e) { return asArrayArg(e, y); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &e) { return asArrayArg(e, y); }, x.u);
   }
   template <typename A, typename B>
   ExtValue asArrayArg(const Fortran::evaluate::Designator<A> &, const B &x) {
@@ -2956,7 +2962,8 @@ class ScalarExprLowering {
     if (isScalar(x) ||
         Fortran::evaluate::UnwrapWholeSymbolOrComponentDataRef(x) ||
         (isTransformationalRef(x) && !isOptimizableTranspose(x, converter)))
-      return std::visit([&](const auto &e) { return genref(e); }, x.u);
+      return Fortran::common::visit([&](const auto &e) { return genref(e); },
+                                    x.u);
     if (useBoxArg)
       return asArrayArg(x);
     return asArray(x);
@@ -2967,7 +2974,8 @@ class ScalarExprLowering {
       return val;
     if (isScalar(x) || Fortran::evaluate::UnwrapWholeSymbolDataRef(x) ||
         inInitializer)
-      return std::visit([&](const auto &e) { return genval(e); }, x.u);
+      return Fortran::common::visit([&](const auto &e) { return genval(e); },
+                                    x.u);
     return asArray(x);
   }
 
@@ -2976,7 +2984,8 @@ class ScalarExprLowering {
                       Fortran::common::TypeCategory::Logical, KIND>> &exp) {
     if (mlir::Value val = getIfOverridenExpr(exp))
       return val;
-    return std::visit([&](const auto &e) { return genval(e); }, exp.u);
+    return Fortran::common::visit([&](const auto &e) { return genval(e); },
+                                  exp.u);
   }
 
   using RefSet =
@@ -3462,7 +3471,7 @@ class ArrayExprLowering {
 
   ExtValue lowerBoxedArrayExpr(const Fortran::lower::SomeExpr &exp) {
     PushSemantics(ConstituentSemantics::BoxValue);
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &e) {
           auto f = genarr(e);
           ExtValue exv = f(IterationSpace{});
@@ -3824,28 +3833,29 @@ class ArrayExprLowering {
         fir::factory::getExtents(loc, builder, exv);
     mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
     for (auto ss : llvm::enumerate(x.subscript())) {
-      std::visit(Fortran::common::visitors{
-                     [&](const Fortran::evaluate::Triplet &trip) {
-                       // For a subscript of triple notation, we compute the
-                       // range of this dimension of the iteration space.
-                       auto lo = [&]() {
-                         if (auto optLo = trip.lower())
-                           return fir::getBase(asScalar(*optLo));
-                         return getLBound(exv, ss.index(), one);
-                       }();
-                       auto hi = [&]() {
-                         if (auto optHi = trip.upper())
-                           return fir::getBase(asScalar(*optHi));
-                         return getUBound(exv, ss.index(), one);
-                       }();
-                       auto step = builder.createConvert(
-                           loc, idxTy, fir::getBase(asScalar(trip.stride())));
-                       auto extent = builder.genExtentFromTriplet(loc, lo, hi,
-                                                                  step, idxTy);
-                       destShape.push_back(extent);
-                     },
-                     [&](auto) {}},
-                 ss.value().u);
+      Fortran::common::visit(
+          Fortran::common::visitors{
+              [&](const Fortran::evaluate::Triplet &trip) {
+                // For a subscript of triple notation, we compute the
+                // range of this dimension of the iteration space.
+                auto lo = [&]() {
+                  if (auto optLo = trip.lower())
+                    return fir::getBase(asScalar(*optLo));
+                  return getLBound(exv, ss.index(), one);
+                }();
+                auto hi = [&]() {
+                  if (auto optHi = trip.upper())
+                    return fir::getBase(asScalar(*optHi));
+                  return getUBound(exv, ss.index(), one);
+                }();
+                auto step = builder.createConvert(
+                    loc, idxTy, fir::getBase(asScalar(trip.stride())));
+                auto extent =
+                    builder.genExtentFromTriplet(loc, lo, hi, step, idxTy);
+                destShape.push_back(extent);
+              },
+              [&](auto) {}},
+          ss.value().u);
     }
     return true;
   }
@@ -3855,8 +3865,8 @@ class ArrayExprLowering {
     return genShapeFromDataRef(x.GetComponent());
   }
   bool genShapeFromDataRef(const Fortran::evaluate::DataRef &x) {
-    return std::visit([&](const auto &v) { return genShapeFromDataRef(v); },
-                      x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return genShapeFromDataRef(v); }, x.u);
   }
 
   /// When in an explicit space, the ranked component must be evaluated to
@@ -3890,7 +3900,7 @@ class ArrayExprLowering {
       TODO(getLoc(),
            "polymorphic array expression lowering with vector subscript");
 
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &e) { return lowerArrayExpression(genarr(e), resTy); },
         exp.u);
   }
@@ -5012,10 +5022,12 @@ class ArrayExprLowering {
     LLVM_DEBUG(Fortran::lower::DumpEvaluateExpr::dump(llvm::dbgs(), x));
     if (isArray(x) || (explicitSpaceIsActive() && isLeftHandSide()) ||
         isElementalProcWithArrayArgs(x))
-      return std::visit([&](const auto &e) { return genarr(e); }, x.u);
+      return Fortran::common::visit([&](const auto &e) { return genarr(e); },
+                                    x.u);
     if (explicitSpaceIsActive()) {
       assert(!isArray(x) && !isLeftHandSide());
-      auto cc = std::visit([&](const auto &e) { return genarr(e); }, x.u);
+      auto cc =
+          Fortran::common::visit([&](const auto &e) { return genarr(e); }, x.u);
       auto result = cc(IterationSpace{});
       return [=](IterSpace) { return result; };
     }
@@ -5289,7 +5301,8 @@ class ArrayExprLowering {
   static Fortran::lower::SomeExpr
   ignoreEvConvert(const Fortran::evaluate::Expr<Fortran::evaluate::Type<
                       Fortran::common::TypeCategory::Integer, 8>> &x) {
-    return std::visit([&](const auto &v) { return ignoreEvConvert(v); }, x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return ignoreEvConvert(v); }, x.u);
   }
   template <Fortran::common::TypeCategory FROM>
   static Fortran::lower::SomeExpr ignoreEvConvert(
@@ -5310,8 +5323,8 @@ class ArrayExprLowering {
   template <typename A>
   static const Fortran::semantics::Symbol *
   extractSubscriptSymbol(const Fortran::evaluate::Expr<A> &x) {
-    return std::visit([&](const auto &v) { return extractSubscriptSymbol(v); },
-                      x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return extractSubscriptSymbol(v); }, x.u);
   }
   template <typename A>
   static const Fortran::semantics::Symbol *
@@ -5420,7 +5433,7 @@ class ArrayExprLowering {
     std::size_t shapeIndex = 0;
     for (auto sub : llvm::enumerate(x.subscript())) {
       const std::size_t subsIndex = sub.index();
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::evaluate::Triplet &t) {
                 mlir::Value lowerBound;
@@ -6034,8 +6047,8 @@ class ArrayExprLowering {
   /// Substrings (see 9.4.1)
   CC genarr(const Fortran::evaluate::Substring &x, ComponentPath &components) {
     components.substring = &x;
-    return std::visit([&](const auto &v) { return genarr(v, components); },
-                      x.parent());
+    return Fortran::common::visit(
+        [&](const auto &v) { return genarr(v, components); }, x.parent());
   }
 
   template <typename T>
@@ -6333,7 +6346,7 @@ class ArrayExprLowering {
     stmtCtx.pushScope();
     std::optional<mlir::Value> charLen;
     for (const Fortran::evaluate::ArrayConstructorValue<A> &acv : x.values()) {
-      auto [exv, copyNeeded] = std::visit(
+      auto [exv, copyNeeded] = Fortran::common::visit(
           [&](const auto &v) {
             return genArrayCtorInitializer(v, resTy, mem, buffPos, buffSize,
                                            stmtCtx);
@@ -6417,7 +6430,7 @@ class ArrayExprLowering {
     // Populate the buffer with the elements, growing as necessary.
     std::optional<mlir::Value> charLen;
     for (const auto &expr : x) {
-      auto [exv, copyNeeded] = std::visit(
+      auto [exv, copyNeeded] = Fortran::common::visit(
           [&](const auto &e) {
             return genArrayCtorInitializer(e, resTy, mem, buffPos, buffSize,
                                            stmtCtx);
@@ -6582,22 +6595,24 @@ class ArrayExprLowering {
   }
   CC genarr(
       const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &r) {
-    return std::visit([&](const auto &x) { return genarr(x); }, r.u);
+    return Fortran::common::visit([&](const auto &x) { return genarr(x); },
+                                  r.u);
   }
 
   template <typename A>
   CC genarr(const Fortran::evaluate::Designator<A> &des) {
     ComponentPath components(des.Rank() > 0);
-    return std::visit([&](const auto &x) { return genarr(x, components); },
-                      des.u);
+    return Fortran::common::visit(
+        [&](const auto &x) { return genarr(x, components); }, des.u);
   }
 
   /// Is the path component rank > 0?
   static bool ranked(const PathComponent &x) {
-    return std::visit(Fortran::common::visitors{
-                          [](const ImplicitSubscripts &) { return false; },
-                          [](const auto *v) { return v->Rank() > 0; }},
-                      x);
+    return Fortran::common::visit(
+        Fortran::common::visitors{
+            [](const ImplicitSubscripts &) { return false; },
+            [](const auto *v) { return v->Rank() > 0; }},
+        x);
   }
 
   void extendComponent(Fortran::lower::ComponentPath &component,
@@ -6653,7 +6668,7 @@ class ArrayExprLowering {
                       : nextPathSemantics());
     unsigned index = 0;
     for (const auto &v : llvm::reverse(revPath)) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const ImplicitSubscripts &) {
                 prefix = false;
@@ -6678,7 +6693,7 @@ class ArrayExprLowering {
                   unsigned ssIndex = 0u;
                   llvm::SmallVector<mlir::Value> componentsToAdd;
                   for (const auto &ss : x->subscript()) {
-                    std::visit(
+                    Fortran::common::visit(
                         Fortran::common::visitors{
                             [&](const Fortran::evaluate::
                                     IndirectSubscriptIntegerExpr &ie) {
@@ -7099,8 +7114,8 @@ class ArrayExprLowering {
   }
 
   CC genarr(const Fortran::evaluate::DataRef &x, ComponentPath &components) {
-    return std::visit([&](const auto &v) { return genarr(v, components); },
-                      x.u);
+    return Fortran::common::visit(
+        [&](const auto &v) { return genarr(v, components); }, x.u);
   }
 
   bool pathIsEmpty(const ComponentPath &components) {
@@ -7575,13 +7590,13 @@ void Fortran::lower::createArrayLoads(
   };
   if (esp.lhsBases[counter]) {
     auto &base = *esp.lhsBases[counter];
-    auto load = std::visit(genLoad, base);
+    auto load = Fortran::common::visit(genLoad, base);
     esp.initialArgs.push_back(load);
     esp.resetInnerArgs();
     esp.bindLoad(base, load);
   }
   for (const auto &base : esp.rhsBases[counter])
-    esp.bindLoad(base, std::visit(genLoad, base));
+    esp.bindLoad(base, Fortran::common::visit(genLoad, base));
 }
 
 void Fortran::lower::createArrayMergeStores(
diff --git a/flang/lib/Lower/ConvertExprToHLFIR.cpp b/flang/lib/Lower/ConvertExprToHLFIR.cpp
index 9035856eabfe7..1933f38f735b5 100644
--- a/flang/lib/Lower/ConvertExprToHLFIR.cpp
+++ b/flang/lib/Lower/ConvertExprToHLFIR.cpp
@@ -75,7 +75,7 @@ class HlfirDesignatorBuilder {
   hlfir::EntityWithAttributes
   gen(const CharacterDesignators &designatorVariant,
       bool vectorSubscriptDesignatorToValue = true) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &x) -> hlfir::EntityWithAttributes {
           return genLeafPartRef(x, vectorSubscriptDesignatorToValue);
         },
@@ -88,7 +88,7 @@ class HlfirDesignatorBuilder {
   hlfir::EntityWithAttributes
   gen(const RealDesignators &designatorVariant,
       bool vectorSubscriptDesignatorToValue = true) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &x) -> hlfir::EntityWithAttributes {
           return genLeafPartRef(x, vectorSubscriptDesignatorToValue);
         },
@@ -101,7 +101,7 @@ class HlfirDesignatorBuilder {
   hlfir::EntityWithAttributes
   gen(const OtherDesignators &designatorVariant,
       bool vectorSubscriptDesignatorToValue = true) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](const auto &x) -> hlfir::EntityWithAttributes {
           return genLeafPartRef(x, vectorSubscriptDesignatorToValue);
         },
@@ -169,7 +169,7 @@ class HlfirDesignatorBuilder {
 
   fir::FortranVariableOpInterface
   gen(const Fortran::evaluate::DataRef &dataRef) {
-    return std::visit(
+    return Fortran::common::visit(
         Fortran::common::visitors{[&](const auto &x) { return gen(x); }},
         dataRef.u);
   }
@@ -364,7 +364,7 @@ class HlfirDesignatorBuilder {
   fir::FortranVariableOpInterface
   gen(const Fortran::evaluate::Substring &substring) {
     PartInfo partInfo;
-    mlir::Type baseStringType = std::visit(
+    mlir::Type baseStringType = Fortran::common::visit(
         [&](const auto &x) { return visit(x, partInfo); }, substring.parent());
     assert(partInfo.typeParams.size() == 1 && "expect base string length");
     // Compute the substring lower and upper bound.
@@ -436,8 +436,8 @@ class HlfirDesignatorBuilder {
 
   mlir::Type visit(const Fortran::evaluate::DataRef &dataRef,
                    PartInfo &partInfo) {
-    return std::visit([&](const auto &x) { return visit(x, partInfo); },
-                      dataRef.u);
+    return Fortran::common::visit(
+        [&](const auto &x) { return visit(x, partInfo); }, dataRef.u);
   }
 
   mlir::Type
@@ -892,7 +892,7 @@ hlfir::EntityWithAttributes HlfirDesignatorBuilder::genDesignatorExpr(
     bool vectorSubscriptDesignatorToValue) {
   // Expr<SomeType> plumbing to unwrap Designator<T> and call
   // gen(Designator<T>.u).
-  return std::visit(
+  return Fortran::common::visit(
       [&](const auto &x) -> hlfir::EntityWithAttributes {
         using T = std::decay_t<decltype(x)>;
         if constexpr (Fortran::common::HasMember<
@@ -904,7 +904,7 @@ hlfir::EntityWithAttributes HlfirDesignatorBuilder::genDesignatorExpr(
                            .u,
                        vectorSubscriptDesignatorToValue);
           } else {
-            return std::visit(
+            return Fortran::common::visit(
                 [&](const auto &preciseKind) {
                   using TK =
                       typename std::decay_t<decltype(preciseKind)>::Result;
@@ -1426,7 +1426,8 @@ class HlfirBuilder {
           return hlfir::EntityWithAttributes{match->second};
       }
     }
-    return std::visit([&](const auto &x) { return gen(x); }, expr.u);
+    return Fortran::common::visit([&](const auto &x) { return gen(x); },
+                                  expr.u);
   }
 
 private:
@@ -1594,7 +1595,7 @@ class HlfirBuilder {
 
   hlfir::EntityWithAttributes
   gen(const Fortran::evaluate::Relational<Fortran::evaluate::SomeType> &op) {
-    return std::visit([&](const auto &x) { return gen(x); }, op.u);
+    return Fortran::common::visit([&](const auto &x) { return gen(x); }, op.u);
   }
 
   hlfir::EntityWithAttributes gen(const Fortran::evaluate::TypeParamInquiry &) {
diff --git a/flang/lib/Lower/ConvertType.cpp b/flang/lib/Lower/ConvertType.cpp
index e6557d7f0b767..f64f6c93541a3 100644
--- a/flang/lib/Lower/ConvertType.cpp
+++ b/flang/lib/Lower/ConvertType.cpp
@@ -212,7 +212,7 @@ struct TypeBuilderImpl {
   }
 
   mlir::Type genTypelessExprType(const Fortran::lower::SomeExpr &expr) {
-    return std::visit(
+    return Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::evaluate::BOZLiteralConstant &) -> mlir::Type {
               return mlir::NoneType::get(context);
diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h
index 48b090f6d2dbe..f0af5f982c14f 100644
--- a/flang/lib/Lower/DirectivesCommon.h
+++ b/flang/lib/Lower/DirectivesCommon.h
@@ -836,7 +836,7 @@ struct PeelConvert {
   static Fortran::semantics::MaybeExpr visit_with_category(
       const Fortran::evaluate::Expr<Fortran::evaluate::Type<Category, Kind>>
           &expr) {
-    return std::visit(
+    return Fortran::common::visit(
         [](auto &&s) { return visit_with_category<Category, Kind>(s); },
         expr.u);
   }
@@ -859,12 +859,12 @@ struct PeelConvert {
   static Fortran::semantics::MaybeExpr
   visit(const Fortran::evaluate::Expr<Fortran::evaluate::SomeKind<Category>>
             &expr) {
-    return std::visit([](auto &&s) { return visit_with_category<Category>(s); },
-                      expr.u);
+    return Fortran::common::visit(
+        [](auto &&s) { return visit_with_category<Category>(s); }, expr.u);
   }
   static Fortran::semantics::MaybeExpr
   visit(const Fortran::evaluate::Expr<Fortran::evaluate::SomeType> &expr) {
-    return std::visit([](auto &&s) { return visit(s); }, expr.u);
+    return Fortran::common::visit([](auto &&s) { return visit(s); }, expr.u);
   }
   template <typename T> //
   static Fortran::semantics::MaybeExpr visit(const T &) {
diff --git a/flang/lib/Lower/IO.cpp b/flang/lib/Lower/IO.cpp
index 97ef991cb3990..9e98b230b676f 100644
--- a/flang/lib/Lower/IO.cpp
+++ b/flang/lib/Lower/IO.cpp
@@ -1388,7 +1388,7 @@ static void threadSpecs(Fortran::lower::AbstractConverter &converter,
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   for (const auto &spec : specList) {
     makeNextConditionalOn(builder, loc, checkResult, ok);
-    ok = std::visit(
+    ok = Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::parser::IoControlSpec::Size &x) -> mlir::Value {
               // Size must be queried after the related READ runtime calls, not
@@ -1425,7 +1425,7 @@ ConditionSpecInfo lowerErrorSpec(Fortran::lower::AbstractConverter &converter,
   ConditionSpecInfo csi;
   const Fortran::lower::SomeExpr *ioMsgExpr = nullptr;
   for (const auto &spec : specList) {
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::parser::StatVariable &var) {
               csi.ioStatExpr = Fortran::semantics::GetExpr(var);
@@ -2397,7 +2397,7 @@ lowerIdExpr(Fortran::lower::AbstractConverter &converter, mlir::Location loc,
             const std::list<Fortran::parser::InquireSpec> &ispecs,
             Fortran::lower::StatementContext &stmtCtx) {
   for (const Fortran::parser::InquireSpec &spec : ispecs)
-    if (mlir::Value v = std::visit(
+    if (mlir::Value v = Fortran::common::visit(
             Fortran::common::visitors{
                 [&](const Fortran::parser::IdExpr &idExpr) {
                   return fir::getBase(converter.genExprValue(
@@ -2419,11 +2419,11 @@ static void threadInquire(Fortran::lower::AbstractConverter &converter,
   mlir::Value idExpr = lowerIdExpr(converter, loc, ispecs, stmtCtx);
   for (const Fortran::parser::InquireSpec &spec : ispecs) {
     makeNextConditionalOn(builder, loc, checkResult, ok);
-    ok = std::visit(Fortran::common::visitors{[&](const auto &x) {
-                      return genInquireSpec(converter, loc, cookie, idExpr, x,
-                                            stmtCtx);
-                    }},
-                    spec.u);
+    ok = Fortran::common::visit(Fortran::common::visitors{[&](const auto &x) {
+                                  return genInquireSpec(converter, loc, cookie,
+                                                        idExpr, x, stmtCtx);
+                                }},
+                                spec.u);
   }
 }
 
diff --git a/flang/lib/Lower/IterationSpace.cpp b/flang/lib/Lower/IterationSpace.cpp
index 6bf310b5cfb76..9303536403837 100644
--- a/flang/lib/Lower/IterationSpace.cpp
+++ b/flang/lib/Lower/IterationSpace.cpp
@@ -21,14 +21,14 @@
 
 unsigned Fortran::lower::getHashValue(
     const Fortran::lower::ExplicitIterSpace::ArrayBases &x) {
-  return std::visit(
+  return Fortran::common::visit(
       [&](const auto *p) { return HashEvaluateExpr::getHashValue(*p); }, x);
 }
 
 bool Fortran::lower::isEqual(
     const Fortran::lower::ExplicitIterSpace::ArrayBases &x,
     const Fortran::lower::ExplicitIterSpace::ArrayBases &y) {
-  return std::visit(
+  return Fortran::common::visit(
       Fortran::common::visitors{
           // Fortran::semantics::Symbol * are the exception here. These pointers
           // have identity; if two Symbol * values are the same (different) then
@@ -169,7 +169,7 @@ class ArrayBaseFinder {
   }
   template <typename... A>
   RT find(const std::variant<A...> &u) {
-    return std::visit([&](const auto &v) { return find(v); }, u);
+    return Fortran::common::visit([&](const auto &v) { return find(v); }, u);
   }
   template <typename A>
   RT find(const std::vector<A> &x) {
@@ -361,22 +361,23 @@ llvm::raw_ostream &
 Fortran::lower::operator<<(llvm::raw_ostream &s,
                            const Fortran::lower::ExplicitIterSpace &e) {
   auto dump = [&](const auto &u) {
-    std::visit(Fortran::common::visitors{
-                   [&](const Fortran::semantics::Symbol *y) {
-                     s << "  " << *y << '\n';
-                   },
-                   [&](const Fortran::evaluate::ArrayRef *y) {
-                     s << "  ";
-                     if (y->base().IsSymbol())
-                       s << y->base().GetFirstSymbol();
-                     else
-                       s << y->base().GetComponent().GetLastSymbol();
-                     s << '\n';
-                   },
-                   [&](const Fortran::evaluate::Component *y) {
-                     s << "  " << y->GetLastSymbol() << '\n';
-                   }},
-               u);
+    Fortran::common::visit(
+        Fortran::common::visitors{
+            [&](const Fortran::semantics::Symbol *y) {
+              s << "  " << *y << '\n';
+            },
+            [&](const Fortran::evaluate::ArrayRef *y) {
+              s << "  ";
+              if (y->base().IsSymbol())
+                s << y->base().GetFirstSymbol();
+              else
+                s << y->base().GetComponent().GetLastSymbol();
+              s << '\n';
+            },
+            [&](const Fortran::evaluate::Component *y) {
+              s << "  " << y->GetLastSymbol() << '\n';
+            }},
+        u);
   };
   s << "LHS bases:\n";
   for (const std::optional<Fortran::lower::ExplicitIterSpace::ArrayBases> &u :
diff --git a/flang/lib/Lower/Mangler.cpp b/flang/lib/Lower/Mangler.cpp
index 9a33be318a27d..878ba6dea49b6 100644
--- a/flang/lib/Lower/Mangler.cpp
+++ b/flang/lib/Lower/Mangler.cpp
@@ -110,7 +110,7 @@ std::string Fortran::lower::mangle::mangleName(
     return fir::NameUniquer::doVariable(modules, procs, blockId, symbolName);
   };
 
-  return std::visit(
+  return Fortran::common::visit(
       Fortran::common::visitors{
           [&](const Fortran::semantics::MainProgramDetails &) {
             return fir::NameUniquer::doProgramEntry().str();
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 4f5da8fb70eba..166fa686cd883 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -46,14 +46,15 @@ static mlir::Location
 genOperandLocation(Fortran::lower::AbstractConverter &converter,
                    const Fortran::parser::AccObject &accObject) {
   mlir::Location loc = converter.genUnknownLocation();
-  std::visit(Fortran::common::visitors{
-                 [&](const Fortran::parser::Designator &designator) {
-                   loc = converter.genLocation(designator.source);
-                 },
-                 [&](const Fortran::parser::Name &name) {
-                   loc = converter.genLocation(name.source);
-                 }},
-             accObject.u);
+  Fortran::common::visit(
+      Fortran::common::visitors{
+          [&](const Fortran::parser::Designator &designator) {
+            loc = converter.genLocation(designator.source);
+          },
+          [&](const Fortran::parser::Name &name) {
+            loc = converter.genLocation(name.source);
+          }},
+      accObject.u);
   return loc;
 }
 
@@ -297,8 +298,8 @@ genDataOperandOperations(const Fortran::parser::AccObjectList &objectList,
     std::stringstream asFortran;
     mlir::Location operandLocation = genOperandLocation(converter, accObject);
     Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
-    Fortran::semantics::MaybeExpr designator =
-        std::visit([&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+    Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+        [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
     Fortran::lower::AddrAndBoundsInfo info =
         Fortran::lower::gatherDataOperandAddrAndBounds<
             mlir::acc::DataBoundsOp, mlir::acc::DataBoundsType>(
@@ -335,8 +336,8 @@ static void genDeclareDataOperandOperations(
     std::stringstream asFortran;
     mlir::Location operandLocation = genOperandLocation(converter, accObject);
     Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
-    Fortran::semantics::MaybeExpr designator =
-        std::visit([&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+    Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+        [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
     Fortran::lower::AddrAndBoundsInfo info =
         Fortran::lower::gatherDataOperandAddrAndBounds<
             mlir::acc::DataBoundsOp, mlir::acc::DataBoundsType>(
@@ -790,8 +791,8 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList,
     std::stringstream asFortran;
     mlir::Location operandLocation = genOperandLocation(converter, accObject);
     Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
-    Fortran::semantics::MaybeExpr designator =
-        std::visit([&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+    Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+        [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
     Fortran::lower::AddrAndBoundsInfo info =
         Fortran::lower::gatherDataOperandAddrAndBounds<
             mlir::acc::DataBoundsOp, mlir::acc::DataBoundsType>(
@@ -1364,8 +1365,8 @@ genReductions(const Fortran::parser::AccObjectListWithReduction &objectList,
     std::stringstream asFortran;
     mlir::Location operandLocation = genOperandLocation(converter, accObject);
     Fortran::semantics::Symbol &symbol = getSymbolFromAccObject(accObject);
-    Fortran::semantics::MaybeExpr designator =
-        std::visit([&](auto &&s) { return ea.Analyze(s); }, accObject.u);
+    Fortran::semantics::MaybeExpr designator = Fortran::common::visit(
+        [&](auto &&s) { return ea.Analyze(s); }, accObject.u);
     Fortran::lower::AddrAndBoundsInfo info =
         Fortran::lower::gatherDataOperandAddrAndBounds<
             mlir::acc::DataBoundsOp, mlir::acc::DataBoundsType>(
@@ -3414,7 +3415,7 @@ static void genGlobalCtors(Fortran::lower::AbstractConverter &converter,
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();
   for (const auto &accObject : accObjectList.v) {
     mlir::Location operandLocation = genOperandLocation(converter, accObject);
-    std::visit(
+    Fortran::common::visit(
         Fortran::common::visitors{
             [&](const Fortran::parser::Designator &designator) {
               if (const auto *name =
@@ -3993,7 +3994,7 @@ genACC(Fortran::lower::AbstractConverter &converter,
        const Fortran::parser::OpenACCAtomicConstruct &atomicConstruct) {
 
   mlir::Location loc = converter.genLocation(atomicConstruct.source);
-  std::visit(
+  Fortran::common::visit(
       Fortran::common::visitors{
           [&](const Fortran::parser::AccAtomicRead &atomicRead) {
             Fortran::lower::genOmpAccAtomicRead<Fortran::parser::AccAtomicRead,
@@ -4061,7 +4062,7 @@ mlir::Value Fortran::lower::genOpenACCConstruct(
     const Fortran::parser::OpenACCConstruct &accConstruct) {
 
   mlir::Value exitCond;
-  std::visit(
+  Fortran::common::visit(
       common::visitors{
           [&](const Fortran::parser::OpenACCBlockConstruct &blockConstruct) {
             genACC(converter, semanticsContext, eval, blockConstruct);
@@ -4101,7 +4102,7 @@ void Fortran::lower::genOpenACCDeclarativeConstruct(
     const Fortran::parser::OpenACCDeclarativeConstruct &accDeclConstruct,
     Fortran::lower::AccRoutineInfoMappingList &accRoutineInfos) {
 
-  std::visit(
+  Fortran::common::visit(
       common::visitors{
           [&](const Fortran::parser::OpenACCStandaloneDeclarativeConstruct
                   &standaloneDeclarativeConstruct) {
diff --git a/flang/lib/Lower/OpenMP/Clauses.cpp b/flang/lib/Lower/OpenMP/Clauses.cpp
index 6bae62cd85860..c355b63deff8a 100644
--- a/flang/lib/Lower/OpenMP/Clauses.cpp
+++ b/flang/lib/Lower/OpenMP/Clauses.cpp
@@ -38,8 +38,8 @@ llvm::omp::Clause getClauseIdForClass(C &&) {
 } // namespace detail
 
 static llvm::omp::Clause getClauseId(const Fortran::parser::OmpClause &clause) {
-  return std::visit([](auto &&s) { return detail::getClauseIdForClass(s); },
-                    clause.u);
+  return Fortran::common::visit(
+      [](auto &&s) { return detail::getClauseIdForClass(s); }, clause.u);
 }
 
 namespace Fortran::lower::omp {
@@ -83,7 +83,7 @@ struct SymbolAndDesignatorExtractor {
 
   template <typename T>
   static SymbolWithDesignator visit(const evaluate::Expr<T> &e) {
-    return std::visit([](auto &&s) { return visit(s); }, e.u);
+    return Fortran::common::visit([](auto &&s) { return visit(s); }, e.u);
   }
 
   static void verify(const SymbolWithDesignator &sd) {
@@ -112,7 +112,7 @@ struct SymbolAndDesignatorExtractor {
 SymbolWithDesignator getSymbolAndDesignator(const MaybeExpr &expr) {
   if (!expr)
     return SymbolWithDesignator{};
-  return std::visit(
+  return Fortran::common::visit(
       [](auto &&s) { return SymbolAndDesignatorExtractor::visit(s); }, expr->u);
 }
 
@@ -278,7 +278,7 @@ DefinedOperator makeDefinedOperator(const parser::DefinedOperator &inp,
       // clang-format on
   );
 
-  return std::visit(
+  return Fortran::common::visit(
       common::visitors{
           [&](const parser::DefinedOpName &s) {
             return DefinedOperator{
@@ -294,7 +294,7 @@ DefinedOperator makeDefinedOperator(const parser::DefinedOperator &inp,
 ProcedureDesignator
 makeProcedureDesignator(const parser::ProcedureDesignator &inp,
                         semantics::SemanticsContext &semaCtx) {
-  return ProcedureDesignator{std::visit(
+  return ProcedureDesignator{Fortran::common::visit(
       common::visitors{
           [&](const parser::Name &t) { return makeObject(t, semaCtx); },
           [&](const parser::ProcComponentRef &t) {
@@ -306,7 +306,7 @@ makeProcedureDesignator(const parser::ProcedureDesignator &inp,
 
 ReductionOperator makeReductionOperator(const parser::OmpReductionOperator &inp,
                                         semantics::SemanticsContext &semaCtx) {
-  return std::visit(
+  return Fortran::common::visit(
       common::visitors{
           [&](const parser::DefinedOperator &s) {
             return ReductionOperator{makeDefinedOperator(s, semaCtx)};
@@ -366,7 +366,7 @@ Allocate make(const parser::OmpClause::Allocate &inp,
 
   using Tuple = decltype(Allocate::t);
 
-  return Allocate{std::visit(
+  return Allocate{Fortran::common::visit(
       common::visitors{
           // simple-modifier
           [&](const wrapped::AllocateModifier::Allocator &v) -> Tuple {
@@ -531,7 +531,7 @@ Depend make(const parser::OmpClause::Depend &inp,
       // clang-format on
   );
 
-  return Depend{std::visit( //
+  return Depend{Fortran::common::visit( //
       common::visitors{
           // Doacross
           [&](const wrapped::Source &s) -> Variant {
@@ -793,7 +793,7 @@ Linear make(const parser::OmpClause::Linear &inp,
 
   using Tuple = decltype(Linear::t);
 
-  return Linear{std::visit(
+  return Linear{Fortran::common::visit(
       common::visitors{
           [&](const wrapped::WithModifier &s) -> Tuple {
             return {
@@ -949,7 +949,7 @@ Order make(const parser::OmpClause::Order &inp,
   auto &t1 = std::get<wrapped::Type>(inp.v.t);
 
   auto convert3 = [&](const parser::OmpOrderModifier &s) {
-    return std::visit(
+    return Fortran::common::visit(
         [&](parser::OmpOrderModifier::Kind k) { return convert1(k); }, s.u);
   };
   return Order{
@@ -1212,7 +1212,7 @@ UsesAllocators make(const parser::OmpClause::UsesAllocators &inp,
 
 Clause makeClause(const parser::OmpClause &cls,
                   semantics::SemanticsContext &semaCtx) {
-  return std::visit(
+  return Fortran::common::visit(
       [&](auto &&s) {
         return makeClause(getClauseId(cls), clause::make(s, semaCtx),
                           cls.source);
diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index aac22f0faad37..7d75e6f67dc1b 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2199,7 +2199,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPDeclarativeConstruct &ompDeclConstruct) {
-  std::visit(
+  Fortran::common::visit(
       [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); },
       ompDeclConstruct.u);
 }
@@ -2276,7 +2276,7 @@ static void
 genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
        semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval,
        const parser::OpenMPStandaloneConstruct &standaloneConstruct) {
-  std::visit(
+  Fortran::common::visit(
       [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); },
       standaloneConstruct.u);
 }
@@ -2296,7 +2296,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPAtomicConstruct &atomicConstruct) {
-  std::visit(
+  Fortran::common::visit(
       common::visitors{
           [&](const parser::OmpAtomicRead &atomicRead) {
             mlir::Location loc = converter.genLocation(atomicRead.source);
@@ -2487,7 +2487,7 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable,
                    semantics::SemanticsContext &semaCtx,
                    lower::pft::Evaluation &eval,
                    const parser::OpenMPConstruct &ompConstruct) {
-  std::visit(
+  Fortran::common::visit(
       [&](auto &&s) { return genOMP(converter, symTable, semaCtx, eval, s); },
       ompConstruct.u);
 }
@@ -2649,21 +2649,22 @@ void Fortran::lower::gatherOpenMPDeferredDeclareTargets(
     const parser::OpenMPDeclarativeConstruct &ompDecl,
     llvm::SmallVectorImpl<OMPDeferredDeclareTargetInfo>
         &deferredDeclareTarget) {
-  std::visit(common::visitors{
-                 [&](const parser::OpenMPDeclareTargetConstruct &ompReq) {
-                   collectDeferredDeclareTargets(converter, semaCtx, eval,
-                                                 ompReq, deferredDeclareTarget);
-                 },
-                 [&](const auto &) {},
-             },
-             ompDecl.u);
+  Fortran::common::visit(
+      common::visitors{
+          [&](const parser::OpenMPDeclareTargetConstruct &ompReq) {
+            collectDeferredDeclareTargets(converter, semaCtx, eval, ompReq,
+                                          deferredDeclareTarget);
+          },
+          [&](const auto &) {},
+      },
+      ompDecl.u);
 }
 
 bool Fortran::lower::isOpenMPDeviceDeclareTarget(
     lower::AbstractConverter &converter, semantics::SemanticsContext &semaCtx,
     lower::pft::Evaluation &eval,
     const parser::OpenMPDeclarativeConstruct &ompDecl) {
-  return std::visit(
+  return Fortran::common::visit(
       common::visitors{
           [&](const parser::OpenMPDeclareTargetConstruct &ompReq) {
             mlir::omp::DeclareTargetDeviceType targetType =
diff --git a/flang/lib/Lower/OpenMP/Utils.cpp b/flang/lib/Lower/OpenMP/Utils.cpp
index 36d96f37ff36a..8aeef175ad2d2 100644
--- a/flang/lib/Lower/OpenMP/Utils.cpp
+++ b/flang/lib/Lower/OpenMP/Utils.cpp
@@ -325,7 +325,7 @@ void insertChildMapInfoIntoParent(
 
 semantics::Symbol *getOmpObjectSymbol(const parser::OmpObject &ompObject) {
   semantics::Symbol *sym = nullptr;
-  std::visit(
+  Fortran::common::visit(
       common::visitors{
           [&](const parser::Designator &designator) {
             if (auto *arrayEle =
diff --git a/flang/lib/Lower/PFTBuilder.cpp b/flang/lib/Lower/PFTBuilder.cpp
index fc34895d1ec75..5b3d5471925bf 100644
--- a/flang/lib/Lower/PFTBuilder.cpp
+++ b/flang/lib/Lower/PFTBuilder.cpp
@@ -103,7 +103,7 @@ class PFTBuilder {
             stmt.unwrapped, pftParentStack.back(), stmt.position, stmt.label});
         return false;
       } else if constexpr (std::is_same_v<T, parser::ActionStmt>) {
-        return std::visit(
+        return Fortran::common::visit(
             common::visitors{
                 [&](const common::Indirection<parser::CallStmt> &x) {
                   addEvaluation(lower::pft::Evaluation{
@@ -239,7 +239,7 @@ class PFTBuilder {
 
   // Get rid of production wrapper
   bool Pre(const parser::Statement<parser::ForallAssignmentStmt> &statement) {
-    addEvaluation(std::visit(
+    addEvaluation(Fortran::common::visit(
         [&](const auto &x) {
           return lower::pft::Evaluation{x, pftParentStack.back(),
                                         statement.source, statement.label};
@@ -248,7 +248,7 @@ class PFTBuilder {
     return false;
   }
   bool Pre(const parser::WhereBodyConstruct &whereBody) {
-    return std::visit(
+    return Fortran::common::visit(
         common::visitors{
             [&](const parser::Statement<parser::AssignmentStmt> &stmt) {
               // Not caught as other AssignmentStmt because it is not
@@ -469,7 +469,7 @@ class PFTBuilder {
   makeEvaluationAction(const parser::ActionStmt &statement,
                        parser::CharBlock position,
                        std::optional<parser::Label> label) {
-    return std::visit(
+    return Fortran::common::visit(
         common::visitors{
             [&](const auto &x) {
               return lower::pft::Evaluation{
@@ -664,7 +664,7 @@ class PFTBuilder {
     };
     auto analyzeSpecs{[&](const auto &specList) {
       for (const auto &spec : specList) {
-        std::visit(
+        Fortran::common::visit(
             Fortran::common::visitors{
                 [&](const Fortran::parser::Format &format) {
                   analyzeFormatSpec(format);
@@ -1172,26 +1172,27 @@ class PFTDumper {
   void dumpPFT(llvm::raw_ostream &outputStream,
                const lower::pft::Program &pft) {
     for (auto &unit : pft.getUnits()) {
-      std::visit(common::visitors{
-                     [&](const lower::pft::BlockDataUnit &unit) {
-                       outputStream << getNodeIndex(unit) << " ";
-                       outputStream << "BlockData: ";
-                       outputStream << "\nEnd BlockData\n\n";
-                     },
-                     [&](const lower::pft::FunctionLikeUnit &func) {
-                       dumpFunctionLikeUnit(outputStream, func);
-                     },
-                     [&](const lower::pft::ModuleLikeUnit &unit) {
-                       dumpModuleLikeUnit(outputStream, unit);
-                     },
-                     [&](const lower::pft::CompilerDirectiveUnit &unit) {
-                       dumpCompilerDirectiveUnit(outputStream, unit);
-                     },
-                     [&](const lower::pft::OpenACCDirectiveUnit &unit) {
-                       dumpOpenACCDirectiveUnit(outputStream, unit);
-                     },
-                 },
-                 unit);
+      Fortran::common::visit(
+          common::visitors{
+              [&](const lower::pft::BlockDataUnit &unit) {
+                outputStream << getNodeIndex(unit) << " ";
+                outputStream << "BlockData: ";
+                outputStream << "\nEnd BlockData\n\n";
+              },
+              [&](const lower::pft::FunctionLikeUnit &func) {
+                dumpFunctionLikeUnit(outputStream, func);
+              },
+              [&](const lower::pft::ModuleLikeUnit &unit) {
+                dumpModuleLikeUnit(outputStream, unit);
+              },
+              [&](const lower::pft::CompilerDirectiveUnit &unit) {
+                dumpCompilerDirectiveUnit(outputStream, unit);
+              },
+              [&](const lower::pft::OpenACCDirectiveUnit &unit) {
+                dumpOpenACCDirectiveUnit(outputStream, unit);
+              },
+          },
+          unit);
     }
   }
 
diff --git a/flang/lib/Lower/VectorSubscripts.cpp b/flang/lib/Lower/VectorSubscripts.cpp
index d7a311d32d59d..389a89ddcf102 100644
--- a/flang/lib/Lower/VectorSubscripts.cpp
+++ b/flang/lib/Lower/VectorSubscripts.cpp
@@ -55,10 +55,11 @@ class VectorSubscriptBoxBuilder {
     using Designator = Fortran::evaluate::Designator<T>;
     if constexpr (Fortran::common::HasMember<Designator, ExprVariant>) {
       const auto &designator = std::get<Designator>(expr.u);
-      return std::visit([&](const auto &x) { return gen(x); }, designator.u);
+      return Fortran::common::visit([&](const auto &x) { return gen(x); },
+                                    designator.u);
     } else {
-      return std::visit([&](const auto &x) { return genDesignator(x); },
-                        expr.u);
+      return Fortran::common::visit(
+          [&](const auto &x) { return genDesignator(x); }, expr.u);
     }
   }
 
@@ -66,8 +67,8 @@ class VectorSubscriptBoxBuilder {
   // type of X elements.
 
   mlir::Type gen(const Fortran::evaluate::DataRef &dataRef) {
-    return std::visit([&](const auto &ref) -> mlir::Type { return gen(ref); },
-                      dataRef.u);
+    return Fortran::common::visit(
+        [&](const auto &ref) -> mlir::Type { return gen(ref); }, dataRef.u);
   }
 
   mlir::Type gen(const Fortran::evaluate::SymbolRef &symRef) {
@@ -128,7 +129,7 @@ class VectorSubscriptBoxBuilder {
   mlir::Type gen(const Fortran::evaluate::ArrayRef &arrayRef) {
     auto isTripletOrVector =
         [](const Fortran::evaluate::Subscript &subscript) -> bool {
-      return std::visit(
+      return Fortran::common::visit(
           Fortran::common::visitors{
               [](const Fortran::evaluate::IndirectSubscriptIntegerExpr &expr) {
                 return expr.value().Rank() != 0;
@@ -165,7 +166,7 @@ class VectorSubscriptBoxBuilder {
     mlir::Type idxTy = builder.getIndexType();
     mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
     for (const auto &subscript : llvm::enumerate(arrayRef.subscript())) {
-      std::visit(
+      Fortran::common::visit(
           Fortran::common::visitors{
               [&](const Fortran::evaluate::IndirectSubscriptIntegerExpr &expr) {
                 if (expr.value().Rank() == 0) {
@@ -327,24 +328,24 @@ Fortran::lower::VectorSubscriptBox::createSlice(fir::FirOpBuilder &builder,
   mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1);
   auto undef = builder.create<fir::UndefOp>(loc, idxTy);
   for (const LoweredSubscript &subscript : loweredSubscripts)
-    std::visit(Fortran::common::visitors{
-                   [&](const LoweredTriplet &triplet) {
-                     triples.emplace_back(triplet.lb);
-                     triples.emplace_back(triplet.ub);
-                     triples.emplace_back(triplet.stride);
-                   },
-                   [&](const LoweredVectorSubscript &vector) {
-                     triples.emplace_back(one);
-                     triples.emplace_back(vector.size);
-                     triples.emplace_back(one);
-                   },
-                   [&](const mlir::Value &i) {
-                     triples.emplace_back(i);
-                     triples.emplace_back(undef);
-                     triples.emplace_back(undef);
-                   },
-               },
-               subscript);
+    Fortran::common::visit(Fortran::common::visitors{
+                               [&](const LoweredTriplet &triplet) {
+                                 triples.emplace_back(triplet.lb);
+                                 triples.emplace_back(triplet.ub);
+                                 triples.emplace_back(triplet.stride);
+                               },
+                               [&](const LoweredVectorSubscript &vector) {
+                                 triples.emplace_back(one);
+                                 triples.emplace_back(vector.size);
+                                 triples.emplace_back(one);
+                               },
+                               [&](const mlir::Value &i) {
+                                 triples.emplace_back(i);
+                                 triples.emplace_back(undef);
+                                 triples.emplace_back(undef);
+                               },
+                           },
+                           subscript);
   return builder.create<fir::SliceOp>(loc, triples, componentPath);
 }
 
@@ -390,28 +391,28 @@ fir::ExtendedValue Fortran::lower::VectorSubscriptBox::getElementAt(
   llvm::SmallVector<mlir::Value> indexes;
   size_t inductionIdx = inductionVariables.size() - 1;
   for (const LoweredSubscript &subscript : loweredSubscripts)
-    std::visit(Fortran::common::visitors{
-                   [&](const LoweredTriplet &triplet) {
-                     indexes.emplace_back(inductionVariables[inductionIdx--]);
-                   },
-                   [&](const LoweredVectorSubscript &vector) {
-                     mlir::Value vecIndex = inductionVariables[inductionIdx--];
-                     mlir::Value vecBase = fir::getBase(vector.vector);
-                     mlir::Type vecEleTy = fir::unwrapSequenceType(
-                         fir::unwrapPassByRefType(vecBase.getType()));
-                     mlir::Type refTy = builder.getRefType(vecEleTy);
-                     auto vecEltRef = builder.create<fir::CoordinateOp>(
-                         loc, refTy, vecBase, vecIndex);
-                     auto vecElt =
-                         builder.create<fir::LoadOp>(loc, vecEleTy, vecEltRef);
-                     indexes.emplace_back(
-                         builder.createConvert(loc, idxTy, vecElt));
-                   },
-                   [&](const mlir::Value &i) {
-                     indexes.emplace_back(builder.createConvert(loc, idxTy, i));
-                   },
-               },
-               subscript);
+    Fortran::common::visit(
+        Fortran::common::visitors{
+            [&](const LoweredTriplet &triplet) {
+              indexes.emplace_back(inductionVariables[inductionIdx--]);
+            },
+            [&](const LoweredVectorSubscript &vector) {
+              mlir::Value vecIndex = inductionVariables[inductionIdx--];
+              mlir::Value vecBase = fir::getBase(vector.vector);
+              mlir::Type vecEleTy = fir::unwrapSequenceType(
+                  fir::unwrapPassByRefType(vecBase.getType()));
+              mlir::Type refTy = builder.getRefType(vecEleTy);
+              auto vecEltRef = builder.create<fir::CoordinateOp>(
+                  loc, refTy, vecBase, vecIndex);
+              auto vecElt =
+                  builder.create<fir::LoadOp>(loc, vecEleTy, vecEltRef);
+              indexes.emplace_back(builder.createConvert(loc, idxTy, vecElt));
+            },
+            [&](const mlir::Value &i) {
+              indexes.emplace_back(builder.createConvert(loc, idxTy, i));
+            },
+        },
+        subscript);
   mlir::Type refTy = builder.getRefType(getElementType());
   auto elementAddr = builder.create<fir::ArrayCoorOp>(
       loc, refTy, fir::getBase(loweredBase), shape, slice, indexes,
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index ab106f62aecfb..c929d05038462 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -1788,7 +1788,7 @@ IntrinsicLibrary::genIntrinsicCall(llvm::StringRef specificName,
   llvm::StringRef name = genericName(specificName);
   if (const IntrinsicHandler *handler = findIntrinsicHandler(name)) {
     bool outline = handler->outline || outlineAllIntrinsics;
-    return {std::visit(
+    return {Fortran::common::visit(
                 [&](auto &generator) -> fir::ExtendedValue {
                   return invokeHandler(generator, *handler, resultType, args,
                                        outline, *this);
@@ -1802,7 +1802,7 @@ IntrinsicLibrary::genIntrinsicCall(llvm::StringRef specificName,
   if (fir::getTargetTriple(mod).isPPC()) {
     if (const IntrinsicHandler *ppcHandler = findPPCIntrinsicHandler(name)) {
       bool outline = ppcHandler->outline || outlineAllIntrinsics;
-      return {std::visit(
+      return {Fortran::common::visit(
                   [&](auto &generator) -> fir::ExtendedValue {
                     return invokeHandler(generator, *ppcHandler, resultType,
                                          args, outline, *this);
@@ -2136,7 +2136,7 @@ mlir::SymbolRefAttr IntrinsicLibrary::getUnrestrictedIntrinsicSymbolRefAttr(
   bool loadRefArguments = true;
   mlir::func::FuncOp funcOp;
   if (const IntrinsicHandler *handler = findIntrinsicHandler(name))
-    funcOp = std::visit(
+    funcOp = Fortran::common::visit(
         [&](auto generator) {
           return getWrapper(generator, name, signature, loadRefArguments);
         },
diff --git a/flang/lib/Semantics/check-acc-structure.cpp b/flang/lib/Semantics/check-acc-structure.cpp
index 69b9fe17e6a88..25140a0473749 100644
--- a/flang/lib/Semantics/check-acc-structure.cpp
+++ b/flang/lib/Semantics/check-acc-structure.cpp
@@ -403,9 +403,9 @@ void AccStructureChecker::CheckMultipleOccurrenceInDeclare(
   if (GetContext().directive != llvm::acc::Directive::ACCD_declare)
     return;
   for (const auto &object : list.v) {
-    std::visit(
-        Fortran::common::visitors{
-            [&](const Fortran::parser::Designator &designator) {
+    common::visit(
+        common::visitors{
+            [&](const parser::Designator &designator) {
               if (const auto *name = getDesignatorNameIfDataRef(designator)) {
                 if (declareSymbols.contains(&name->symbol->GetUltimate())) {
                   if (declareSymbols[&name->symbol->GetUltimate()] == clause) {
@@ -435,7 +435,7 @@ void AccStructureChecker::CheckMultipleOccurrenceInDeclare(
                 declareSymbols.insert({&name->symbol->GetUltimate(), clause});
               }
             },
-            [&](const Fortran::parser::Name &name) {
+            [&](const parser::Name &name) {
               // TODO: check common block
             }},
         object.u);
@@ -674,9 +674,9 @@ void AccStructureChecker::Enter(const parser::AccClause::Reduction &reduction) {
   const auto &objects{std::get<parser::AccObjectList>(list.t)};
 
   for (const auto &object : objects.v) {
-    std::visit(
-        Fortran::common::visitors{
-            [&](const Fortran::parser::Designator &designator) {
+    common::visit(
+        common::visitors{
+            [&](const parser::Designator &designator) {
               if (const auto *name = getDesignatorNameIfDataRef(designator)) {
                 const auto *type{name->symbol->GetType()};
                 if (type->IsNumeric(TypeCategory::Integer) &&
diff --git a/flang/lib/Semantics/check-coarray.cpp b/flang/lib/Semantics/check-coarray.cpp
index 106af7960fa94..6cf61a6b923db 100644
--- a/flang/lib/Semantics/check-coarray.cpp
+++ b/flang/lib/Semantics/check-coarray.cpp
@@ -93,7 +93,7 @@ static void CheckCoindexedStatOrErrmsg(SemanticsContext &context,
       }
     }
   }};
-  std::visit(CoindexedCheck, statOrErrmsg.u);
+  Fortran::common::visit(CoindexedCheck, statOrErrmsg.u);
 }
 
 static void CheckSyncStatList(

From 7ddff3a586baaf6f4403183ba51121951ce0602e Mon Sep 17 00:00:00 2001
From: Abid Qadeer <haqadeer@amd.com>
Date: Mon, 17 Jun 2024 21:00:09 +0100
Subject: [PATCH 06/25] [flang] Fix comments and formatting. (NFC) (#95786)

As mentioned
[here](https://github.com/llvm/llvm-project/pull/95462#discussion_r1640126721),
the formatting of the comments has been fixed. Comments have also been added
before literal arguments.
---
 .../Transforms/DebugTypeGenerator.cpp         | 24 +++++++++----------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
index 70ff8b386ac3d..407ecc8e327b4 100644
--- a/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
+++ b/flang/lib/Optimizer/Transforms/DebugTypeGenerator.cpp
@@ -75,8 +75,8 @@ static mlir::LLVM::DITypeAttr genBasicType(mlir::MLIRContext *context,
 }
 
 static mlir::LLVM::DITypeAttr genPlaceholderType(mlir::MLIRContext *context) {
-  return genBasicType(context, mlir::StringAttr::get(context, "integer"), 32,
-                      llvm::dwarf::DW_ATE_signed);
+  return genBasicType(context, mlir::StringAttr::get(context, "integer"),
+                      /*bitSize=*/32, llvm::dwarf::DW_ATE_signed);
 }
 
 mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType(
@@ -145,11 +145,10 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertBoxedSequenceType(
     elements.push_back(subrangeTy);
   }
   return mlir::LLVM::DICompositeTypeAttr::get(
-      context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {},
-      /* name */ nullptr, /* file */ nullptr, /* line */ 0,
-      /* scope */ nullptr, elemTy, mlir::LLVM::DIFlags::Zero,
-      /* sizeInBits */ 0, /*alignInBits*/ 0, elements, dataLocation,
-      /* rank */ nullptr, allocated, associated);
+      context, llvm::dwarf::DW_TAG_array_type, /*recursive_id=*/{},
+      /*name=*/nullptr, /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy,
+      mlir::LLVM::DIFlags::Zero, /*sizeInBits=*/0, /*alignInBits=*/0, elements,
+      dataLocation, /*rank=*/nullptr, allocated, associated);
 }
 
 mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType(
@@ -184,12 +183,11 @@ mlir::LLVM::DITypeAttr DebugTypeGenerator::convertSequenceType(
   // have been set to some valid default values.
 
   return mlir::LLVM::DICompositeTypeAttr::get(
-      context, llvm::dwarf::DW_TAG_array_type, /*recursive id*/ {},
-      /* name */ nullptr, /* file */ nullptr, /* line */ 0, /* scope */ nullptr,
-      elemTy, mlir::LLVM::DIFlags::Zero, /* sizeInBits */ 0,
-      /*alignInBits*/ 0, elements, /* dataLocation */ nullptr,
-      /* rank */ nullptr, /* allocated */ nullptr,
-      /* associated */ nullptr);
+      context, llvm::dwarf::DW_TAG_array_type, /*recursive_id=*/{},
+      /*name=*/nullptr, /*file=*/nullptr, /*line=*/0, /*scope=*/nullptr, elemTy,
+      mlir::LLVM::DIFlags::Zero, /*sizeInBits=*/0, /*alignInBits=*/0, elements,
+      /*dataLocation=*/nullptr, /*rank=*/nullptr, /*allocated=*/nullptr,
+      /*associated=*/nullptr);
 }
 
 mlir::LLVM::DITypeAttr DebugTypeGenerator::convertCharacterType(

From 7620fe0d2d1e0257611c0ab0d96f3bf1bf7a1079 Mon Sep 17 00:00:00 2001
From: Louis Dionne <ldionne.2@gmail.com>
Date: Mon, 17 Jun 2024 16:20:58 -0400
Subject: [PATCH 07/25] [CI][format] Explicitly pass extensions to
 git-clang-format (#95794)

This ensures that the CI script controls which file extensions are
considered instead of letting git-clang-format apply its own filtering
rules. In particular, this properly handles libc++ extension-less
headers which were passed to git-clang-format, but then dropped by that
tool as having an unrecognized extension.
---
 llvm/utils/git/code-format-helper.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/utils/git/code-format-helper.py b/llvm/utils/git/code-format-helper.py
index f1207026704e8..d60d4131bc94b 100755
--- a/llvm/utils/git/code-format-helper.py
+++ b/llvm/utils/git/code-format-helper.py
@@ -216,6 +216,17 @@ def format_run(self, changed_files: List[str], args: FormatArgs) -> Optional[str
             cf_cmd.append(args.start_rev)
             cf_cmd.append(args.end_rev)
 
+        # Gather the extension of all modified files and pass them explicitly to git-clang-format.
+        # This prevents git-clang-format from applying its own filtering rules on top of ours.
+        extensions = set()
+        for file in cpp_files:
+            _, ext = os.path.splitext(file)
+            extensions.add(
+                ext.strip(".")
+            )  # Exclude periods since git-clang-format takes extensions without them
+        cf_cmd.append("--extensions")
+        cf_cmd.append("'{}'".format(",".join(extensions)))
+
         cf_cmd.append("--")
         cf_cmd += cpp_files
 

From 5b04b6fe3fabba8f76d730da3c0d528e1dd0c184 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 17 Jun 2024 13:21:34 -0700
Subject: [PATCH 08/25] [HWASan] [compiler-rt] support non-4k pages on Android
 (#95069)

---
 compiler-rt/lib/asan/asan_linux.cpp             |  3 ++-
 compiler-rt/lib/asan/asan_mac.cpp               |  3 ++-
 compiler-rt/lib/asan/asan_premap_shadow.cpp     |  3 ++-
 .../lib/hwasan/hwasan_dynamic_shadow.cpp        | 17 +++++++++++------
 compiler-rt/lib/memprof/memprof_linux.cpp       |  3 ++-
 .../lib/sanitizer_common/sanitizer_common.h     | 17 ++++++++---------
 .../lib/sanitizer_common/sanitizer_linux.cpp    |  4 ++--
 .../sanitizer_linux_libcdep.cpp                 |  5 ++---
 .../lib/sanitizer_common/sanitizer_mac.cpp      |  4 ++--
 .../lib/sanitizer_common/sanitizer_win.cpp      |  5 ++---
 10 files changed, 35 insertions(+), 29 deletions(-)

diff --git a/compiler-rt/lib/asan/asan_linux.cpp b/compiler-rt/lib/asan/asan_linux.cpp
index a517de5af00dc..0b470db86748f 100644
--- a/compiler-rt/lib/asan/asan_linux.cpp
+++ b/compiler-rt/lib/asan/asan_linux.cpp
@@ -93,7 +93,8 @@ uptr FindDynamicShadowStart() {
 #  endif
 
   return MapDynamicShadow(shadow_size_bytes, ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
diff --git a/compiler-rt/lib/asan/asan_mac.cpp b/compiler-rt/lib/asan/asan_mac.cpp
index b250f796e165f..bfc349223258b 100644
--- a/compiler-rt/lib/asan/asan_mac.cpp
+++ b/compiler-rt/lib/asan/asan_mac.cpp
@@ -51,7 +51,8 @@ bool IsSystemHeapAddress (uptr addr) { return false; }
 
 uptr FindDynamicShadowStart() {
   return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 // No-op. Mac does not support static linkage anyway.
diff --git a/compiler-rt/lib/asan/asan_premap_shadow.cpp b/compiler-rt/lib/asan/asan_premap_shadow.cpp
index bed2f62a22511..6e08b8f966507 100644
--- a/compiler-rt/lib/asan/asan_premap_shadow.cpp
+++ b/compiler-rt/lib/asan/asan_premap_shadow.cpp
@@ -33,7 +33,8 @@ uptr PremapShadowSize() {
 // PremapShadowSize() bytes on the right of it are mapped r/o.
 uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), /*mmap_alignment_scale*/ 3,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 bool PremapShadowFailed() {
diff --git a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
index 7642ba6c0bf08..48bc3b631ac07 100644
--- a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
@@ -36,15 +36,20 @@ decltype(__hwasan_shadow)* __hwasan_premap_shadow();
 
 namespace __hwasan {
 
+// We cannot call anything in libc here (see comment above), so we need to
+// assume the biggest allowed page size.
+// Android max page size is defined as 16k here:
+// https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/page.h#41
+static constexpr uptr kMaxGranularity = 16384;
+
 // Conservative upper limit.
 static uptr PremapShadowSize() {
-  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale,
-                   GetMmapGranularity());
+  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale, kMaxGranularity);
 }
 
 static uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), kShadowScale,
-                          kShadowBaseAlignment, kHighMemEnd);
+                          kShadowBaseAlignment, kHighMemEnd, kMaxGranularity);
 }
 
 static bool IsPremapShadowAvailable() {
@@ -56,7 +61,7 @@ static bool IsPremapShadowAvailable() {
 }
 
 static uptr FindPremappedShadowStart(uptr shadow_size_bytes) {
-  const uptr granularity = GetMmapGranularity();
+  const uptr granularity = kMaxGranularity;
   const uptr shadow_start = reinterpret_cast<uptr>(&__hwasan_shadow);
   const uptr premap_shadow_size = PremapShadowSize();
   const uptr shadow_size = RoundUpTo(shadow_size_bytes, granularity);
@@ -109,7 +114,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
   if (IsPremapShadowAvailable())
     return FindPremappedShadowStart(shadow_size_bytes);
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd);
+                          kHighMemEnd, kMaxGranularity);
 }
 
 }  // namespace __hwasan
@@ -135,7 +140,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
                                     RingBufferSize());
 #  endif
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd);
+                          kHighMemEnd, GetMmapGranularity());
 }
 
 }  // namespace __hwasan
diff --git a/compiler-rt/lib/memprof/memprof_linux.cpp b/compiler-rt/lib/memprof/memprof_linux.cpp
index 26a2b456b874e..fbe5d250f840b 100644
--- a/compiler-rt/lib/memprof/memprof_linux.cpp
+++ b/compiler-rt/lib/memprof/memprof_linux.cpp
@@ -58,7 +58,8 @@ void InitializePlatformExceptionHandlers() {}
 uptr FindDynamicShadowStart() {
   uptr shadow_size_bytes = MemToShadowSize(kHighMemEnd);
   return MapDynamicShadow(shadow_size_bytes, SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 void *MemprofDlSymNext(const char *sym) { return dlsym(RTLD_NEXT, sym); }
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index c451fc962c529..2d1059140c303 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -60,14 +60,10 @@ inline int Verbosity() {
   return atomic_load(&current_verbosity, memory_order_relaxed);
 }
 
-#if SANITIZER_ANDROID
-inline uptr GetPageSize() {
-// Android post-M sysconf(_SC_PAGESIZE) crashes if called from .preinit_array.
-  return 4096;
-}
-inline uptr GetPageSizeCached() {
-  return 4096;
-}
+#if SANITIZER_ANDROID && !defined(__aarch64__)
+// 32-bit Android only has 4k pages.
+inline uptr GetPageSize() { return 4096; }
+inline uptr GetPageSizeCached() { return 4096; }
 #else
 uptr GetPageSize();
 extern uptr PageSizeCached;
@@ -77,6 +73,7 @@ inline uptr GetPageSizeCached() {
   return PageSizeCached;
 }
 #endif
+
 uptr GetMmapGranularity();
 uptr GetMaxVirtualAddress();
 uptr GetMaxUserVirtualAddress();
@@ -91,6 +88,7 @@ void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
 
 // Memory management
 void *MmapOrDie(uptr size, const char *mem_type, bool raw_report = false);
+
 inline void *MmapOrDieQuietly(uptr size, const char *mem_type) {
   return MmapOrDie(size, mem_type, /*raw_report*/ true);
 }
@@ -139,7 +137,8 @@ void UnmapFromTo(uptr from, uptr to);
 // shadow_size_bytes bytes on the right, which on linux is mapped no access.
 // The high_mem_end may be updated if the original shadow size doesn't fit.
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end);
+                      uptr min_shadow_base_alignment, uptr &high_mem_end,
+                      uptr granularity);
 
 // Let S = max(shadow_size, num_aliases * alias_size, ring_buffer_size).
 // Reserves 2*S bytes of address space to the right of the returned address and
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 5d2dd3a7a658f..d15caa76efb06 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -1136,7 +1136,7 @@ uptr GetMaxUserVirtualAddress() {
   return addr;
 }
 
-#  if !SANITIZER_ANDROID
+#  if !SANITIZER_ANDROID || defined(__aarch64__)
 uptr GetPageSize() {
 #    if SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__)) && \
         defined(EXEC_PAGESIZE)
@@ -1155,7 +1155,7 @@ uptr GetPageSize() {
   return sysconf(_SC_PAGESIZE);  // EXEC_PAGESIZE may not be trustworthy.
 #    endif
 }
-#  endif  // !SANITIZER_ANDROID
+#  endif
 
 uptr ReadBinaryName(/*out*/ char *buf, uptr buf_len) {
 #  if SANITIZER_SOLARIS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index 6d05411222d9e..175362183fd78 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -995,9 +995,8 @@ void UnmapFromTo(uptr from, uptr to) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment,
-                      UNUSED uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 24e3d1112520e..cbdf3e95925bf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -1188,8 +1188,8 @@ uptr GetMaxVirtualAddress() {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
index 4e5ad8e4693b4..0b198890fc798 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
@@ -384,9 +384,8 @@ bool DontDumpShadowMemory(uptr addr, uptr length) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment,
-                      UNUSED uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =

From d6cc35f7f67575f2d3534ea385c2f36f48f49aea Mon Sep 17 00:00:00 2001
From: Peiming Liu <peiming@google.com>
Date: Mon, 17 Jun 2024 13:29:53 -0700
Subject: [PATCH 09/25] Reapply "[mlir][sparse] implement lowering rules for
 IterateOp." (#95836)

---
 .../Transforms/SparseIterationToScf.cpp       | 121 +++++++++++++++++-
 .../Transforms/Utils/SparseTensorIterator.cpp |  40 ++++++
 .../Transforms/Utils/SparseTensorIterator.h   |  26 +++-
 .../SparseTensor/sparse_iteration_to_scf.mlir |  54 ++++++--
 4 files changed, 224 insertions(+), 17 deletions(-)

diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
index 62887c75c872b..4224925147c84 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseIterationToScf.cpp
@@ -34,6 +34,20 @@ convertIterSpaceType(IterSpaceType itSp, SmallVectorImpl<Type> &fields) {
   return success();
 }
 
+static std::optional<LogicalResult>
+convertIteratorType(IteratorType itTp, SmallVectorImpl<Type> &fields) {
+  // The actual iterator values (that are updated every iteration).
+  auto idxTp = IndexType::get(itTp.getContext());
+  // TODO: handle batch dimension.
+  assert(itTp.getEncoding().getBatchLvlRank() == 0);
+  if (!itTp.isUnique()) {
+    // Segment high for non-unique iterator.
+    fields.push_back(idxTp);
+  }
+  fields.push_back(idxTp);
+  return success();
+}
+
 namespace {
 
 /// Sparse codegen rule for number of entries operator.
@@ -57,10 +71,114 @@ class ExtractIterSpaceConverter
   }
 };
 
+class SparseIterateOpConverter : public OneToNOpConversionPattern<IterateOp> {
+public:
+  using OneToNOpConversionPattern::OneToNOpConversionPattern;
+  LogicalResult
+  matchAndRewrite(IterateOp op, OpAdaptor adaptor,
+                  OneToNPatternRewriter &rewriter) const override {
+    if (!op.getCrdUsedLvls().empty())
+      return rewriter.notifyMatchFailure(
+          op, "non-empty coordinates list not implemented.");
+
+    Location loc = op.getLoc();
+
+    auto iterSpace = SparseIterationSpace::fromValues(
+        op.getIterSpace().getType(), adaptor.getIterSpace(), 0);
+
+    std::unique_ptr<SparseIterator> it =
+        iterSpace.extractIterator(rewriter, loc);
+
+    if (it->iteratableByFor()) {
+      auto [lo, hi] = it->genForCond(rewriter, loc);
+      Value step = constantIndex(rewriter, loc, 1);
+      SmallVector<Value> ivs;
+      for (ValueRange inits : adaptor.getInitArgs())
+        llvm::append_range(ivs, inits);
+      scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, ivs);
+
+      Block *loopBody = op.getBody();
+      OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes());
+      if (failed(typeConverter->convertSignatureArgs(
+              loopBody->getArgumentTypes(), bodyTypeMapping)))
+        return failure();
+      rewriter.applySignatureConversion(loopBody, bodyTypeMapping);
+
+      rewriter.eraseBlock(forOp.getBody());
+      Region &dstRegion = forOp.getRegion();
+      rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end());
+
+      auto yieldOp =
+          llvm::cast<sparse_tensor::YieldOp>(forOp.getBody()->getTerminator());
+
+      rewriter.setInsertionPointToEnd(forOp.getBody());
+      // replace sparse_tensor.yield with scf.yield.
+      rewriter.create<scf::YieldOp>(loc, yieldOp.getResults());
+      rewriter.eraseOp(yieldOp);
+
+      const OneToNTypeMapping &resultMapping = adaptor.getResultMapping();
+      rewriter.replaceOp(op, forOp.getResults(), resultMapping);
+    } else {
+      SmallVector<Value> ivs;
+      llvm::append_range(ivs, it->getCursor());
+      for (ValueRange inits : adaptor.getInitArgs())
+        llvm::append_range(ivs, inits);
+
+      assert(llvm::all_of(ivs, [](Value v) { return v != nullptr; }));
+
+      TypeRange types = ValueRange(ivs).getTypes();
+      auto whileOp = rewriter.create<scf::WhileOp>(loc, types, ivs);
+      SmallVector<Location> l(types.size(), op.getIterator().getLoc());
+
+      // Generates loop conditions.
+      Block *before = rewriter.createBlock(&whileOp.getBefore(), {}, types, l);
+      rewriter.setInsertionPointToStart(before);
+      ValueRange bArgs = before->getArguments();
+      auto [whileCond, remArgs] = it->genWhileCond(rewriter, loc, bArgs);
+      assert(remArgs.size() == adaptor.getInitArgs().size());
+      rewriter.create<scf::ConditionOp>(loc, whileCond, before->getArguments());
+
+      // Generates loop body.
+      Block *loopBody = op.getBody();
+      OneToNTypeMapping bodyTypeMapping(loopBody->getArgumentTypes());
+      if (failed(typeConverter->convertSignatureArgs(
+              loopBody->getArgumentTypes(), bodyTypeMapping)))
+        return failure();
+      rewriter.applySignatureConversion(loopBody, bodyTypeMapping);
+
+      Region &dstRegion = whileOp.getAfter();
+      // TODO: handle uses of coordinate!
+      rewriter.inlineRegionBefore(op.getRegion(), dstRegion, dstRegion.end());
+      ValueRange aArgs = whileOp.getAfterArguments();
+      auto yieldOp = llvm::cast<sparse_tensor::YieldOp>(
+          whileOp.getAfterBody()->getTerminator());
+
+      rewriter.setInsertionPointToEnd(whileOp.getAfterBody());
+
+      aArgs = it->linkNewScope(aArgs);
+      ValueRange nx = it->forward(rewriter, loc);
+      SmallVector<Value> yields;
+      llvm::append_range(yields, nx);
+      llvm::append_range(yields, yieldOp.getResults());
+
+      // replace sparse_tensor.yield with scf.yield.
+      rewriter.eraseOp(yieldOp);
+      rewriter.create<scf::YieldOp>(loc, yields);
+
+      const OneToNTypeMapping &resultMapping = adaptor.getResultMapping();
+      rewriter.replaceOp(
+          op, whileOp.getResults().drop_front(it->getCursor().size()),
+          resultMapping);
+    }
+    return success();
+  }
+};
+
 } // namespace
 
 mlir::SparseIterationTypeConverter::SparseIterationTypeConverter() {
   addConversion([](Type type) { return type; });
+  addConversion(convertIteratorType);
   addConversion(convertIterSpaceType);
 
   addSourceMaterialization([](OpBuilder &builder, IterSpaceType spTp,
@@ -74,5 +192,6 @@ mlir::SparseIterationTypeConverter::SparseIterationTypeConverter() {
 
 void mlir::populateLowerSparseIterationToSCFPatterns(
     TypeConverter &converter, RewritePatternSet &patterns) {
-  patterns.add<ExtractIterSpaceConverter>(converter, patterns.getContext());
+  patterns.add<ExtractIterSpaceConverter, SparseIterateOpConverter>(
+      converter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
index be8e15d6ae6f4..ef95fcc84bd90 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.cpp
@@ -331,6 +331,13 @@ class TrivialIterator : public ConcreteIterator {
   TrivialIterator(const SparseTensorLevel &stl)
       : ConcreteIterator(stl, IterKind::kTrivial, /*itValCnt=*/1) {}
 
+  TrivialIterator(OpBuilder &b, Location l, const SparseTensorLevel &stl,
+                  Value posLo, Value posHi)
+      : ConcreteIterator(stl, IterKind::kTrivial, /*itValCnt=*/1), posLo(posLo),
+        posHi(posHi) {
+    seek(posLo);
+  }
+
   std::string getDebugInterfacePrefix() const override {
     return std::string("trivial<") + stl.toString() + ">";
   }
@@ -420,6 +427,14 @@ class DedupIterator : public ConcreteIterator {
       : ConcreteIterator(stl, IterKind::kDedup, /*itValCnt=*/2) {
     assert(!stl.isUnique());
   }
+
+  DedupIterator(OpBuilder &b, Location l, const SparseTensorLevel &stl,
+                Value posLo, Value posHi)
+      : ConcreteIterator(stl, IterKind::kDedup, /*itValCnt=*/2), posHi(posHi) {
+    assert(!stl.isUnique());
+    seek({posLo, genSegmentHigh(b, l, posLo)});
+  }
+
   // For LLVM-style RTTI.
   static bool classof(const SparseIterator *from) {
     return from->kind == IterKind::kDedup;
@@ -1532,6 +1547,11 @@ SparseIterationSpace mlir::sparse_tensor::SparseIterationSpace::fromValues(
   return space;
 }
 
+std::unique_ptr<SparseIterator>
+SparseIterationSpace::extractIterator(OpBuilder &b, Location l) const {
+  return makeSimpleIterator(b, l, *this);
+}
+
 //===----------------------------------------------------------------------===//
 // SparseIterator factory functions.
 //===----------------------------------------------------------------------===//
@@ -1590,6 +1610,26 @@ sparse_tensor::makeSynLevelAndIterator(Value sz, unsigned tid, unsigned lvl,
   return std::make_pair(std::move(stl), std::move(it));
 }
 
+std::unique_ptr<SparseIterator>
+sparse_tensor::makeSimpleIterator(OpBuilder &b, Location l,
+                                  const SparseIterationSpace &iterSpace) {
+  // assert(iterSpace.getSpaceDim() == 1);
+  std::unique_ptr<SparseIterator> ret;
+  if (!iterSpace.isUnique()) {
+    // We always deduplicate the non-unique level, but we should optimize it
+    // away if possible.
+    ret = std::make_unique<DedupIterator>(b, l, iterSpace.getLastLvl(),
+                                          iterSpace.getBoundLo(),
+                                          iterSpace.getBoundHi());
+  } else {
+    ret = std::make_unique<TrivialIterator>(b, l, iterSpace.getLastLvl(),
+                                            iterSpace.getBoundLo(),
+                                            iterSpace.getBoundHi());
+  }
+  ret->setSparseEmitStrategy(SparseEmitStrategy::kFunctional);
+  return ret;
+}
+
 std::unique_ptr<SparseIterator>
 sparse_tensor::makeSimpleIterator(const SparseTensorLevel &stl,
                                   SparseEmitStrategy strategy) {
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
index 17636af2b2f9d..91f363db93f1d 100644
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Utils/SparseTensorIterator.h
@@ -132,6 +132,10 @@ class SparseIterationSpace {
   Value getBoundLo() const { return bound.first; }
   Value getBoundHi() const { return bound.second; }
 
+  // Extract an iterator to iterate over the sparse iteration space.
+  std::unique_ptr<SparseIterator> extractIterator(OpBuilder &b,
+                                                  Location l) const;
+
 private:
   SmallVector<std::unique_ptr<SparseTensorLevel>> lvls;
   std::pair<Value, Value> bound;
@@ -192,6 +196,13 @@ class SparseIterator {
     crd = nullptr;
   }
 
+  // Reconstructs an iterator directly from the provided ValueRange.
+  static std::unique_ptr<SparseIterator>
+  fromValues(IteratorType dstTp, ValueRange values, unsigned tid);
+
+  // The inverse operation of `fromValues`.
+  SmallVector<Value> toValues() const { llvm_unreachable("Not implemented"); }
+
   //
   // Iterator properties.
   //
@@ -345,12 +356,21 @@ std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(OpBuilder &b,
                                                          unsigned tid,
                                                          Level lvl);
 
-/// Helper function to create a TensorLevel object from given `tensor`.
+/// Helper function to create a TensorLevel object from given ValueRange.
 std::unique_ptr<SparseTensorLevel> makeSparseTensorLevel(LevelType lt, Value sz,
                                                          ValueRange buffers,
                                                          unsigned tid, Level l);
-/// Helper function to create a simple SparseIterator object that iterates
-/// over the SparseTensorLevel.
+
+/// Helper function to create a simple SparseIterator object that iterates
+/// over the entire iteration space.
+std::unique_ptr<SparseIterator>
+makeSimpleIterator(OpBuilder &b, Location l,
+                   const SparseIterationSpace &iterSpace);
+
+/// Helper function to create a simple SparseIterator object that iterates
+/// over the sparse tensor level.
+/// TODO: switch to `SparseIterationSpace` (which supports N-D iterators) when
+/// feature complete.
 std::unique_ptr<SparseIterator> makeSimpleIterator(
     const SparseTensorLevel &stl,
     SparseEmitStrategy strategy = SparseEmitStrategy::kFunctional);
diff --git a/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir b/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir
index 5fcd661bb69b2..77a0e89dc7c81 100644
--- a/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir
+++ b/mlir/test/Dialect/SparseTensor/sparse_iteration_to_scf.mlir
@@ -1,4 +1,5 @@
 // RUN: mlir-opt %s --lower-sparse-iteration-to-scf | FileCheck %s
+// RUN: mlir-opt %s --sparse-space-collapse --lower-sparse-iteration-to-scf | FileCheck %s --check-prefix COLLAPSED
 
 #COO = #sparse_tensor.encoding<{
   map = (i, j) -> (
@@ -7,17 +8,44 @@
   )
 }>
 
-// CHECK-LABEL:   func.func @sparse_1D_space(
-// CHECK-SAME:      %[[VAL_0:.*]]: tensor<?x?xf32, #sparse{{[0-9]*}}>) -> !sparse_tensor.iter_space<#sparse{{[0-9]*}}, lvls = 0> {
-// CHECK-DAG:       %[[C0:.*]] = arith.constant 0 : index
-// CHECK-DAG:       %[[C1:.*]] = arith.constant 1 : index
-// CHECK-DAG:       %[[LVL_SIZE:.*]] = sparse_tensor.lvl %[[VAL_0]], %[[C0]] : tensor<?x?xf32, #sparse{{[0-9]*}}>
-// CHECK:           %[[POS_MEM:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 0 : index} : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK:           %[[CRD_MEM:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 0 : index} : tensor<?x?xf32, #sparse{{[0-9]*}}> to memref<?xindex>
-// CHECK:           %[[POS_LO:.*]] = memref.load %[[POS_MEM]]{{\[}}%[[C0]]] : memref<?xindex>
-// CHECK:           %[[POS_HI:.*]] = memref.load %[[POS_MEM]]{{\[}}%[[C1]]] : memref<?xindex>
-// CHECK:           %[[ITER_SPACE:.*]] = builtin.unrealized_conversion_cast %[[POS_MEM]], %[[CRD_MEM]], %[[LVL_SIZE]], %[[POS_LO]], %[[POS_HI]]
-func.func @sparse_1D_space(%sp : tensor<?x?xf32, #COO>) -> !sparse_tensor.iter_space<#COO, lvls = 0> {
-  %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0 : tensor<?x?xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
-  return %l1 : !sparse_tensor.iter_space<#COO, lvls = 0>
+// CHECK-LABEL:   @sparse_iteration_to_scf
+//                  // deduplication
+// CHECK:           scf.while {{.*}} {
+// CHECK:           } do {
+// CHECK:           }
+// CHECK:           scf.while {{.*}} {
+// CHECK:           } do {
+//                    // actual computation
+// CHECK:             scf.for {{.*}} {
+// CHECK:               arith.addi
+// CHECK:             }
+//                    // deduplication
+// CHECK:             scf.while {{.*}} {
+// CHECK:             } do {
+// CHECK:             }
+// CHECK:             scf.yield
+// CHECK:           }
+// CHECK:           return
+
+// COLLAPSED-LABEL:   @sparse_iteration_to_scf
+// COLLAPSED:           %[[RET:.*]] = scf.for {{.*}} {
+// COLLAPSED:             %[[VAL:.*]] = arith.addi
+// COLLAPSED:             scf.yield %[[VAL]] : index
+// COLLAPSED:           }
+// COLLAPSED:           return %[[RET]] : index
+func.func @sparse_iteration_to_scf(%sp : tensor<4x8xf32, #COO>) -> index {
+  %i = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+  %l1 = sparse_tensor.extract_iteration_space %sp lvls = 0
+      : tensor<4x8xf32, #COO> -> !sparse_tensor.iter_space<#COO, lvls = 0>
+  %r1 = sparse_tensor.iterate %it1 in %l1 iter_args(%outer = %i): !sparse_tensor.iter_space<#COO, lvls = 0 to 1> -> index {
+    %l2 = sparse_tensor.extract_iteration_space %sp at %it1 lvls = 1
+        : tensor<4x8xf32, #COO>, !sparse_tensor.iterator<#COO, lvls = 0 to 1> -> !sparse_tensor.iter_space<#COO, lvls = 1>
+    %r2 = sparse_tensor.iterate %it2 in %l2 iter_args(%inner = %outer): !sparse_tensor.iter_space<#COO, lvls = 1 to 2> -> index {
+      %k = arith.addi %inner, %c1 : index
+      sparse_tensor.yield %k : index
+    }
+    sparse_tensor.yield %r2 : index
+  }
+  return %r1 : index
 }

From dcb6c0d71c8dbb6bb17391c968c3716cfafd3765 Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Mon, 17 Jun 2024 15:44:35 -0500
Subject: [PATCH 10/25] Reland [mlir][Target] Improve ROCDL gpu serialization
 API (#95813)

Reland: https://github.com/llvm/llvm-project/pull/95456

This patch improves the ROCDL gpu serialization API by:
- Introducing the enum `AMDGCNLibraries` for specifying the AMD GCN
device code libraries to use during linking.
- Removing `getCommonBitcodeLibs` in favor of `AMDGCNLibraries`.
Previously `getCommonBitcodeLibs` would try to load all AMD GCN bitcode
libraries; now it will only load the requested libraries.
- Exposing the `compileToBinary` method and making it virtual, allowing
downstream users to re-use this method.
- Exposing `moduleToObjectImpl`, this method provides a prototype flow
for compiling to binary, allowing downstream users to re-use this
method.
- It also avoids constructing the control variables if no device
libraries are being used.
- Changes the style of the error messages to be composable, i.e., no full
stops.
- Adds an error message for when the ROCm toolkit can't be found but it
was required.
---
 mlir/include/mlir/Target/LLVM/ROCDL/Utils.h |  41 ++-
 mlir/lib/Dialect/GPU/CMakeLists.txt         |   2 +-
 mlir/lib/Target/LLVM/CMakeLists.txt         |  13 +-
 mlir/lib/Target/LLVM/ROCDL/Target.cpp       | 306 +++++++++++---------
 4 files changed, 213 insertions(+), 149 deletions(-)

diff --git a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
index 374fa65bd02e3..44c9ded317fa5 100644
--- a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
+++ b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
@@ -27,6 +27,19 @@ namespace ROCDL {
 /// 5. Returns an empty string.
 StringRef getROCMPath();
 
+/// Helper enum for specifying the AMD GCN device libraries required for
+/// compilation.
+enum class AMDGCNLibraries : uint32_t {
+  None = 0,
+  Ockl = 1,
+  Ocml = 2,
+  OpenCL = 4,
+  Hip = 8,
+  LastLib = Hip,
+  LLVM_MARK_AS_BITMASK_ENUM(LastLib),
+  All = (LastLib << 1) - 1
+};
+
 /// Base class for all ROCDL serializations from GPU modules into binary
 /// strings. By default this class serializes into LLVM bitcode.
 class SerializeGPUModuleBase : public LLVM::ModuleToObject {
@@ -49,8 +62,8 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   /// Returns the bitcode files to be loaded.
   ArrayRef<std::string> getFileList() const;
 
-  /// Appends standard ROCm device libraries like `ocml.bc`, `ockl.bc`, etc.
-  LogicalResult appendStandardLibs();
+  /// Appends standard ROCm device libraries to `fileList`.
+  LogicalResult appendStandardLibs(AMDGCNLibraries libs);
 
   /// Loads the bitcode files in `fileList`.
   virtual std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
@@ -63,15 +76,20 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   LogicalResult handleBitcodeFile(llvm::Module &module) override;
 
 protected:
-  /// Appends the paths of common ROCm device libraries to `libs`.
-  LogicalResult getCommonBitcodeLibs(llvm::SmallVector<std::string> &libs,
-                                     SmallVector<char, 256> &libPath,
-                                     StringRef isaVersion);
-
   /// Adds `oclc` control variables to the LLVM module.
-  void addControlVariables(llvm::Module &module, bool wave64, bool daz,
-                           bool finiteOnly, bool unsafeMath, bool fastMath,
-                           bool correctSqrt, StringRef abiVer);
+  void addControlVariables(llvm::Module &module, AMDGCNLibraries libs,
+                           bool wave64, bool daz, bool finiteOnly,
+                           bool unsafeMath, bool fastMath, bool correctSqrt,
+                           StringRef abiVer);
+
+  /// Compiles assembly to a binary.
+  virtual std::optional<SmallVector<char, 0>>
+  compileToBinary(const std::string &serializedISA);
+
+  /// Default implementation of `ModuleToObject::moduleToObject`.
+  std::optional<SmallVector<char, 0>>
+  moduleToObjectImpl(const gpu::TargetOptions &targetOptions,
+                     llvm::Module &llvmModule);
 
   /// Returns the assembled ISA.
   std::optional<SmallVector<char, 0>> assembleIsa(StringRef isa);
@@ -84,6 +102,9 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
 
   /// List of LLVM bitcode files to link to.
   SmallVector<std::string> fileList;
+
+  /// AMD GCN libraries to use when linking, the default is using none.
+  AMDGCNLibraries deviceLibs = AMDGCNLibraries::None;
 };
 } // namespace ROCDL
 } // namespace mlir
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 61ab298ebfb98..08c8aea36fac9 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -106,7 +106,7 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
       "Building mlir with ROCm support requires the AMDGPU backend")
   endif()
 
-  set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+  set(DEFAULT_ROCM_PATH "" CACHE PATH "Fallback path to search for ROCm installs")
   target_compile_definitions(obj.MLIRGPUTransforms
     PRIVATE
     __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}"
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 5a3fa160850b4..6e146710d67af 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -123,17 +123,18 @@ add_mlir_dialect_library(MLIRROCDLTarget
   )
 
 if(MLIR_ENABLE_ROCM_CONVERSIONS)
-  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
-    message(SEND_ERROR
-      "Building mlir with ROCm support requires the AMDGPU backend")
-  endif()
-
   if (DEFINED ROCM_PATH)
     set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   elseif(DEFINED ENV{ROCM_PATH})
     set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   else()
-    set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+    IF (WIN32)
+      # Avoid setting a UNIX path on Windows.
+      # TODO: Eventually migrate to FindHIP once it becomes a part of CMake.
+      set(DEFAULT_ROCM_PATH "" CACHE PATH "Fallback path to search for ROCm installs")
+    else()
+      set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
+    endif()
   endif()
   message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}")
 
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index cc13e5b7436ea..6784f3668bde3 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -17,9 +17,6 @@
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVM/ROCDL/Utils.h"
-#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
-#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include "llvm/IR/Constants.h"
@@ -112,8 +109,9 @@ SerializeGPUModuleBase::SerializeGPUModuleBase(
       if (auto file = dyn_cast<StringAttr>(attr))
         fileList.push_back(file.str());
 
-  // Append standard ROCm device bitcode libraries to the files to be loaded.
-  (void)appendStandardLibs();
+  // By default add all libraries if the toolkit path is not empty.
+  if (!getToolkitPath().empty())
+    deviceLibs = AMDGCNLibraries::All;
 }
 
 void SerializeGPUModuleBase::init() {
@@ -138,29 +136,67 @@ ArrayRef<std::string> SerializeGPUModuleBase::getFileList() const {
   return fileList;
 }
 
-LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
+LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) {
+  if (libs == AMDGCNLibraries::None)
+    return success();
   StringRef pathRef = getToolkitPath();
-  if (!pathRef.empty()) {
-    SmallVector<char, 256> path;
-    path.insert(path.begin(), pathRef.begin(), pathRef.end());
-    llvm::sys::path::append(path, "amdgcn", "bitcode");
-    pathRef = StringRef(path.data(), path.size());
-    if (!llvm::sys::fs::is_directory(pathRef)) {
-      getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
-                                  << " does not exist or is not a directory.";
-      return failure();
-    }
-    StringRef isaVersion =
-        llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
-    isaVersion.consume_front("gfx");
-    return getCommonBitcodeLibs(fileList, path, isaVersion);
+  // Fail if the toolkit path is empty.
+  if (pathRef.empty())
+    return failure();
+
+  // Get the path for the device libraries
+  SmallString<256> path;
+  path.insert(path.begin(), pathRef.begin(), pathRef.end());
+  llvm::sys::path::append(path, "amdgcn", "bitcode");
+  pathRef = StringRef(path.data(), path.size());
+
+  // Fail if the path is invalid.
+  if (!llvm::sys::fs::is_directory(pathRef)) {
+    getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
+                                << " does not exist or is not a directory";
+    return failure();
   }
+
+  // Get the ISA version.
+  StringRef isaVersion =
+      llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
+  isaVersion.consume_front("gfx");
+
+  // Helper function for adding a library.
+  auto addLib = [&](const Twine &lib) -> bool {
+    auto baseSize = path.size();
+    llvm::sys::path::append(path, lib);
+    StringRef pathRef(path.data(), path.size());
+    if (!llvm::sys::fs::is_regular_file(pathRef)) {
+      getOperation().emitRemark() << "bitcode library path: " << pathRef
+                                  << " does not exist or is not a file";
+      return true;
+    }
+    fileList.push_back(pathRef.str());
+    path.truncate(baseSize);
+    return false;
+  };
+
+  // Add ROCm device libraries. Fail if any of the libraries is not found,
+  // i.e. if any `addLib` call returned true.
+  if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) ||
+      (any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) ||
+      (any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) ||
+      (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")) ||
+      (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl)) &&
+       addLib("oclc_isa_version_" + isaVersion + ".bc")))
+    return failure();
   return success();
 }
 
 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
+  // Return if there are no libs to load.
+  if (deviceLibs == AMDGCNLibraries::None && fileList.empty())
+    return bcFiles;
+  if (failed(appendStandardLibs(deviceLibs)))
+    return std::nullopt;
   if (failed(loadBitcodeFilesFromList(module.getContext(), fileList, bcFiles,
                                       true)))
     return std::nullopt;
@@ -174,80 +210,76 @@ LogicalResult SerializeGPUModuleBase::handleBitcodeFile(llvm::Module &module) {
   // Stop spamming us with clang version numbers
   if (auto *ident = module.getNamedMetadata("llvm.ident"))
     module.eraseNamedMetadata(ident);
+  // Override the libModules datalayout and target triple with the compiler's
+  // data layout should there be a discrepancy.
+  setDataLayoutAndTriple(module);
   return success();
 }
 
 void SerializeGPUModuleBase::handleModulePreLink(llvm::Module &module) {
-  [[maybe_unused]] std::optional<llvm::TargetMachine *> targetMachine =
-      getOrCreateTargetMachine();
-  assert(targetMachine && "expect a TargetMachine");
-  addControlVariables(module, target.hasWave64(), target.hasDaz(),
+  // If not every library is already requested, traverse the module to
+  // determine which libraries are required.
+  if (deviceLibs != AMDGCNLibraries::All) {
+    for (llvm::Function &f : module.functions()) {
+      if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
+        StringRef funcName = f.getName();
+        if ("printf" == funcName)
+          deviceLibs |= AMDGCNLibraries::OpenCL | AMDGCNLibraries::Ockl |
+                        AMDGCNLibraries::Ocml;
+        if (funcName.starts_with("__ockl_"))
+          deviceLibs |= AMDGCNLibraries::Ockl;
+        if (funcName.starts_with("__ocml_"))
+          deviceLibs |= AMDGCNLibraries::Ocml;
+      }
+    }
+  }
+  addControlVariables(module, deviceLibs, target.hasWave64(), target.hasDaz(),
                       target.hasFiniteOnly(), target.hasUnsafeMath(),
                       target.hasFastMath(), target.hasCorrectSqrt(),
                       target.getAbi());
 }
 
-// Get the paths of ROCm device libraries.
-LogicalResult SerializeGPUModuleBase::getCommonBitcodeLibs(
-    llvm::SmallVector<std::string> &libs, SmallVector<char, 256> &libPath,
-    StringRef isaVersion) {
-  auto addLib = [&](StringRef path) -> bool {
-    if (!llvm::sys::fs::is_regular_file(path)) {
-      getOperation().emitRemark() << "Bitcode library path: " << path
-                                  << " does not exist or is not a file.\n";
-      return true;
-    }
-    libs.push_back(path.str());
-    return false;
-  };
-  auto getLibPath = [&libPath](Twine lib) {
-    auto baseSize = libPath.size();
-    llvm::sys::path::append(libPath, lib + ".bc");
-    std::string path(StringRef(libPath.data(), libPath.size()).str());
-    libPath.truncate(baseSize);
-    return path;
-  };
-
-  // Add ROCm device libraries. Fail if any of the libraries is not found.
-  if (addLib(getLibPath("ocml")) || addLib(getLibPath("ockl")) ||
-      addLib(getLibPath("hip")) || addLib(getLibPath("opencl")) ||
-      addLib(getLibPath("oclc_isa_version_" + isaVersion)))
-    return failure();
-  return success();
-}
-
 void SerializeGPUModuleBase::addControlVariables(
-    llvm::Module &module, bool wave64, bool daz, bool finiteOnly,
-    bool unsafeMath, bool fastMath, bool correctSqrt, StringRef abiVer) {
-  llvm::Type *i8Ty = llvm::Type::getInt8Ty(module.getContext());
-  auto addControlVariable = [i8Ty, &module](StringRef name, bool enable) {
+    llvm::Module &module, AMDGCNLibraries libs, bool wave64, bool daz,
+    bool finiteOnly, bool unsafeMath, bool fastMath, bool correctSqrt,
+    StringRef abiVer) {
+  // Return if no device libraries are required.
+  if (libs == AMDGCNLibraries::None)
+    return;
+  // Helper function for adding control variables.
+  auto addControlVariable = [&module](StringRef name, uint32_t value,
+                                      uint32_t bitwidth) {
+    if (module.getNamedGlobal(name)) {
+      return;
+    }
+    llvm::IntegerType *type =
+        llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
     llvm::GlobalVariable *controlVariable = new llvm::GlobalVariable(
-        module, i8Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
-        llvm::ConstantInt::get(i8Ty, enable), name, nullptr,
-        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+        module, type, /*isConstant=*/true,
+        llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
+        llvm::ConstantInt::get(type, value), name, /*before=*/nullptr,
+        /*threadLocalMode=*/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        /*addressSpace=*/4);
     controlVariable->setVisibility(
         llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
-    controlVariable->setAlignment(llvm::MaybeAlign(1));
+    controlVariable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
     controlVariable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
   };
-  addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath);
-  addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath);
-  addControlVariable("__oclc_daz_opt", daz || fastMath);
-  addControlVariable("__oclc_correctly_rounded_sqrt32",
-                     correctSqrt && !fastMath);
-  addControlVariable("__oclc_wavefrontsize64", wave64);
-
-  llvm::Type *i32Ty = llvm::Type::getInt32Ty(module.getContext());
-  int abi = 500;
-  abiVer.getAsInteger(0, abi);
-  llvm::GlobalVariable *abiVersion = new llvm::GlobalVariable(
-      module, i32Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
-      llvm::ConstantInt::get(i32Ty, abi), "__oclc_ABI_version", nullptr,
-      llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
-  abiVersion->setVisibility(
-      llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
-  abiVersion->setAlignment(llvm::MaybeAlign(4));
-  abiVersion->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+  // Add ocml related control variables.
+  if (any(libs & AMDGCNLibraries::Ocml)) {
+    addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath, 8);
+    addControlVariable("__oclc_daz_opt", daz || fastMath, 8);
+    addControlVariable("__oclc_correctly_rounded_sqrt32",
+                       correctSqrt && !fastMath, 8);
+    addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath, 8);
+  }
+  // Add ocml or ockl related control variables.
+  if (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl))) {
+    addControlVariable("__oclc_wavefrontsize64", wave64, 8);
+    int abi = 500;
+    abiVer.getAsInteger(0, abi);
+    addControlVariable("__oclc_ABI_version", abi, 32);
+  }
 }
 
 std::optional<SmallVector<char, 0>>
@@ -312,48 +344,16 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) {
 
   parser->setTargetParser(*tap);
   parser->Run(false);
-
   return result;
 }
 
-#if MLIR_ENABLE_ROCM_CONVERSIONS
-namespace {
-class AMDGPUSerializer : public SerializeGPUModuleBase {
-public:
-  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                   const gpu::TargetOptions &targetOptions);
-
-  gpu::GPUModuleOp getOperation();
-
-  // Compile to HSA.
-  std::optional<SmallVector<char, 0>>
-  compileToBinary(const std::string &serializedISA);
-
-  std::optional<SmallVector<char, 0>>
-  moduleToObject(llvm::Module &llvmModule) override;
-
-private:
-  // Target options.
-  gpu::TargetOptions targetOptions;
-};
-} // namespace
-
-AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                                   const gpu::TargetOptions &targetOptions)
-    : SerializeGPUModuleBase(module, target, targetOptions),
-      targetOptions(targetOptions) {}
-
-gpu::GPUModuleOp AMDGPUSerializer::getOperation() {
-  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
-}
-
 std::optional<SmallVector<char, 0>>
-AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
+SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   // Assemble the ISA.
   std::optional<SmallVector<char, 0>> isaBinary = assembleIsa(serializedISA);
 
   if (!isaBinary) {
-    getOperation().emitError() << "Failed during ISA assembling.";
+    getOperation().emitError() << "failed during ISA assembling";
     return std::nullopt;
   }
 
@@ -363,7 +363,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel%%", "o", tempIsaBinaryFd,
                                          tempIsaBinaryFilename)) {
     getOperation().emitError()
-        << "Failed to create a temporary file for dumping the ISA binary.";
+        << "failed to create a temporary file for dumping the ISA binary";
     return std::nullopt;
   }
   llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
@@ -378,7 +378,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
                                          tempHsacoFilename)) {
     getOperation().emitError()
-        << "Failed to create a temporary file for the HSA code object.";
+        << "failed to create a temporary file for the HSA code object";
     return std::nullopt;
   }
   llvm::FileRemover cleanupHsaco(tempHsacoFilename);
@@ -389,7 +389,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
       lldPath,
       {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
   if (lldResult != 0) {
-    getOperation().emitError() << "lld invocation failed.";
+    getOperation().emitError() << "lld invocation failed";
     return std::nullopt;
   }
 
@@ -398,7 +398,7 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
       llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false);
   if (!hsacoFile) {
     getOperation().emitError()
-        << "Failed to read the HSA code object from the temp file.";
+        << "failed to read the HSA code object from the temp file";
     return std::nullopt;
   }
 
@@ -407,13 +407,13 @@ AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   return SmallVector<char, 0>(buffer.begin(), buffer.end());
 }
 
-std::optional<SmallVector<char, 0>>
-AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
+std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
+    const gpu::TargetOptions &targetOptions, llvm::Module &llvmModule) {
   // Return LLVM IR if the compilation target is offload.
 #define DEBUG_TYPE "serialize-to-llvm"
   LLVM_DEBUG({
-    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
-                 << "\n"
+    llvm::dbgs() << "LLVM IR for module: "
+                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
@@ -423,8 +423,8 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   std::optional<llvm::TargetMachine *> targetMachine =
       getOrCreateTargetMachine();
   if (!targetMachine) {
-    getOperation().emitError() << "Target Machine unavailable for triple "
-                               << triple << ", can't compile with LLVM\n";
+    getOperation().emitError() << "target Machine unavailable for triple "
+                               << triple << ", can't compile with LLVM";
     return std::nullopt;
   }
 
@@ -432,12 +432,13 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   std::optional<std::string> serializedISA =
       translateToISA(llvmModule, **targetMachine);
   if (!serializedISA) {
-    getOperation().emitError() << "Failed translating the module to ISA.";
+    getOperation().emitError() << "failed translating the module to ISA";
     return std::nullopt;
   }
 #define DEBUG_TYPE "serialize-to-isa"
   LLVM_DEBUG({
-    llvm::dbgs() << "ISA for module: " << getOperation().getNameAttr() << "\n"
+    llvm::dbgs() << "ISA for module: "
+                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
                  << *serializedISA << "\n";
   });
 #undef DEBUG_TYPE
@@ -445,9 +446,48 @@ AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
 
+  // Compiling to binary requires a valid ROCm path. Report an error and bail
+  // out before assembling or linking if it is not found.
+  if (getToolkitPath().empty()) {
+    getOperation().emitError() << "invalid ROCm path, please set a valid path";
+    return std::nullopt;
+  }
+
   // Compile to binary.
   return compileToBinary(*serializedISA);
 }
+
+#if MLIR_ENABLE_ROCM_CONVERSIONS
+namespace {
+class AMDGPUSerializer : public SerializeGPUModuleBase {
+public:
+  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                   const gpu::TargetOptions &targetOptions);
+
+  gpu::GPUModuleOp getOperation();
+
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  // Target options.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                                   const gpu::TargetOptions &targetOptions)
+    : SerializeGPUModuleBase(module, target, targetOptions),
+      targetOptions(targetOptions) {}
+
+gpu::GPUModuleOp AMDGPUSerializer::getOperation() {
+  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
+}
+
+std::optional<SmallVector<char, 0>>
+AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
+  return moduleToObjectImpl(targetOptions, llvmModule);
+}
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 
 std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
@@ -457,7 +494,7 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   if (!module)
     return std::nullopt;
   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
-    module->emitError("Module must be a GPU module.");
+    module->emitError("module must be a GPU module");
     return std::nullopt;
   }
 #if MLIR_ENABLE_ROCM_CONVERSIONS
@@ -466,8 +503,8 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   serializer.init();
   return serializer.run();
 #else
-  module->emitError("The `AMDGPU` target was not built. Please enable it when "
-                    "building LLVM.");
+  module->emitError("the `AMDGPU` target was not built. Please enable it when "
+                    "building LLVM");
   return std::nullopt;
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 }
@@ -477,10 +514,15 @@ ROCDLTargetAttrImpl::createObject(Attribute attribute,
                                   const SmallVector<char, 0> &object,
                                   const gpu::TargetOptions &options) const {
   gpu::CompilationTarget format = options.getCompilationTarget();
+  // If format is `fatbin` transform it to binary as `fatbin` is not yet
+  // supported.
+  if (format > gpu::CompilationTarget::Binary)
+    format = gpu::CompilationTarget::Binary;
+
+  DictionaryAttr properties{};
   Builder builder(attribute.getContext());
   return builder.getAttr<gpu::ObjectAttr>(
-      attribute,
-      format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
-                                              : format,
-      builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
+      attribute, format,
+      builder.getStringAttr(StringRef(object.data(), object.size())),
+      properties);
 }

From 44ca65661e2cdd5636f592f573a2837e6ae948ba Mon Sep 17 00:00:00 2001
From: Joseph Huber <huberjn@outlook.com>
Date: Mon, 17 Jun 2024 15:51:26 -0500
Subject: [PATCH 11/25] [libc] Only include getauxval on AARCH64 targets
 (#95844)

Summary:
Not all platforms support this function or header, but it was being
included by every test. Move it inside of the `ifdef` for the only user,
which is aarch64.
---
 libc/test/IntegrationTest/test.cpp       | 3 ++-
 libc/test/UnitTest/HermeticTestUtils.cpp | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/libc/test/IntegrationTest/test.cpp b/libc/test/IntegrationTest/test.cpp
index a8b2f2911fd8e..0c961dbafb840 100644
--- a/libc/test/IntegrationTest/test.cpp
+++ b/libc/test/IntegrationTest/test.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/common.h"
-#include "src/sys/auxv/getauxval.h"
 #include <stddef.h>
 #include <stdint.h>
 
@@ -83,6 +82,8 @@ void *realloc(void *ptr, size_t s) {
 void *__dso_handle = nullptr;
 
 #ifdef LIBC_TARGET_ARCH_IS_AARCH64
+#include "src/sys/auxv/getauxval.h"
+
 // Due to historical reasons, libgcc on aarch64 may expect __getauxval to be
 // defined. See also https://gcc.gnu.org/pipermail/gcc-cvs/2020-June/300635.html
 unsigned long __getauxval(unsigned long id) {
diff --git a/libc/test/UnitTest/HermeticTestUtils.cpp b/libc/test/UnitTest/HermeticTestUtils.cpp
index 6e815e6c8aab0..191e54b7344a6 100644
--- a/libc/test/UnitTest/HermeticTestUtils.cpp
+++ b/libc/test/UnitTest/HermeticTestUtils.cpp
@@ -7,7 +7,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "src/__support/common.h"
-#include "src/sys/auxv/getauxval.h"
 #include <stddef.h>
 #include <stdint.h>
 
@@ -111,6 +110,8 @@ void __cxa_pure_virtual() {
 void *__dso_handle = nullptr;
 
 #ifdef LIBC_TARGET_ARCH_IS_AARCH64
+#include "src/sys/auxv/getauxval.h"
+
 // Due to historical reasons, libgcc on aarch64 may expect __getauxval to be
 // defined. See also https://gcc.gnu.org/pipermail/gcc-cvs/2020-June/300635.html
 unsigned long __getauxval(unsigned long id) {

From a50bcc03cbaecf6473c6bf41f4497758a7876f3d Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 17 Jun 2024 16:56:19 -0400
Subject: [PATCH 12/25] [libc][stdlib] Only add internal malloc in full build
 mode.  Use the system malloc in overlay mode. (#95845)

This causes an issue in overlay mode:
https://github.com/llvm/llvm-project/pull/95736#issuecomment-2172739705
---
 libc/src/stdlib/CMakeLists.txt | 26 +++++++++++++++++---------
 1 file changed, 17 insertions(+), 9 deletions(-)

diff --git a/libc/src/stdlib/CMakeLists.txt b/libc/src/stdlib/CMakeLists.txt
index e26c19f03f5ab..fdbf7b75e72f4 100644
--- a/libc/src/stdlib/CMakeLists.txt
+++ b/libc/src/stdlib/CMakeLists.txt
@@ -418,15 +418,23 @@ else()
       libc.src.string.memory_utils.inline_memcpy
       libc.src.string.memory_utils.inline_memset
   )
-  add_entrypoint_object(
-    malloc
-    SRCS
-      freelist_malloc.cpp
-    HDRS
-      malloc.h
-    DEPENDS
-      .freelist_heap
-  )
+  # Only add malloc in full build mode.  Use the system malloc in overlay mode.
+  if(LLVM_LIBC_FULL_BUILD)
+    add_entrypoint_object(
+      malloc
+      SRCS
+        freelist_malloc.cpp
+      HDRS
+        malloc.h
+      DEPENDS
+        .freelist_heap
+    )
+  else()
+    add_entrypoint_external(
+      malloc
+    )
+  endif()
+
   add_entrypoint_external(
     free
   )

From 16aa39ad94350670f4d72dace0a4866fbe10d716 Mon Sep 17 00:00:00 2001
From: Joshua Baehring <98630690+JoshuaMBa@users.noreply.github.com>
Date: Mon, 17 Jun 2024 14:06:31 -0700
Subject: [PATCH 13/25] [scudo] Update error handling for secondary cache entry
 count (#95595)

Initially, the scudo allocator would return an error if the user
attempted to set the cache capacity
(i.e. the number of possible entries in the cache) above the maximum
cache capacity.
Now the allocator will resort to using the maximum cache capacity in
this event.
An error will still be returned if the user attempts to set the number
of entries to a negative value.
---
 compiler-rt/lib/scudo/standalone/secondary.h              | 7 ++++---
 compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp | 6 +++---
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/secondary.h b/compiler-rt/lib/scudo/standalone/secondary.h
index d8c9f5bcfcaf6..9a8e53be388b7 100644
--- a/compiler-rt/lib/scudo/standalone/secondary.h
+++ b/compiler-rt/lib/scudo/standalone/secondary.h
@@ -391,10 +391,11 @@ template <typename Config> class MapAllocatorCache {
       return true;
     }
     if (O == Option::MaxCacheEntriesCount) {
-      const u32 MaxCount = static_cast<u32>(Value);
-      if (MaxCount > Config::getEntriesArraySize())
+      if (Value < 0)
         return false;
-      atomic_store_relaxed(&MaxEntriesCount, MaxCount);
+      atomic_store_relaxed(
+          &MaxEntriesCount,
+          Min<u32>(static_cast<u32>(Value), Config::getEntriesArraySize()));
       return true;
     }
     if (O == Option::MaxCacheEntrySize) {
diff --git a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
index 8f0250e88ebf3..af69313214ea6 100644
--- a/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/secondary_test.cpp
@@ -192,9 +192,9 @@ TEST_F(MapAllocatorTest, SecondaryIterate) {
 
 TEST_F(MapAllocatorTest, SecondaryOptions) {
   // Attempt to set a maximum number of entries higher than the array size.
-  EXPECT_FALSE(
-      Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4096U));
-  // A negative number will be cast to a scudo::u32, and fail.
+  EXPECT_TRUE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, 4096U));
+
+  // Attempt to set an invalid (negative) number of entries
   EXPECT_FALSE(Allocator->setOption(scudo::Option::MaxCacheEntriesCount, -1));
   if (Allocator->canCache(0U)) {
     // Various valid combinations.

From 6037a698b919e0c8dbf39673d68835c49dc5130a Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 17 Jun 2024 14:18:55 -0700
Subject: [PATCH 14/25] [mlgo] inline for size: add bypass mechanism for
 preserving performance (#95616)

This allows shrinking for size the cold part of the code, without sacrificing performance.
---
 llvm/include/llvm/Analysis/MLInlineAdvisor.h  |  2 +
 llvm/lib/Analysis/MLInlineAdvisor.cpp         | 18 ++++-
 .../models/gen-inline-oz-test-model.py        | 14 +++-
 llvm/test/Transforms/Inline/ML/bypass.ll      | 78 +++++++++++++++++++
 4 files changed, 107 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/Transforms/Inline/ML/bypass.ll

diff --git a/llvm/include/llvm/Analysis/MLInlineAdvisor.h b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
index f58862e533529..2aa077fe0e035 100644
--- a/llvm/include/llvm/Analysis/MLInlineAdvisor.h
+++ b/llvm/include/llvm/Analysis/MLInlineAdvisor.h
@@ -13,6 +13,7 @@
 #include "llvm/Analysis/InlineAdvisor.h"
 #include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/IR/PassManager.h"
 
 #include <deque>
@@ -89,6 +90,7 @@ class MLInlineAdvisor : public InlineAdvisor {
   llvm::SmallPtrSet<const LazyCallGraph::Node *, 1> NodesInLastSCC;
   DenseSet<const LazyCallGraph::Node *> AllNodes;
   bool ForceStop = false;
+  ProfileSummaryInfo &PSI;
 };
 
 /// InlineAdvice that tracks changes post inlining. For that reason, it only
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
index 75eb8ece2e447..21946572339b9 100644
--- a/llvm/lib/Analysis/MLInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -14,6 +14,7 @@
 #include "llvm/Analysis/MLInlineAdvisor.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/FunctionPropertiesAnalysis.h"
 #include "llvm/Analysis/InlineCost.h"
@@ -23,6 +24,7 @@
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MLModelRunner.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ReleaseModeModelRunner.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/Dominators.h"
@@ -46,6 +48,14 @@ static cl::opt<bool>
     InteractiveIncludeDefault("inliner-interactive-include-default", cl::Hidden,
                               cl::desc(InclDefaultMsg));
 
+enum class SkipMLPolicyCriteria { Never, IfCallerIsNotCold };
+
+static cl::opt<SkipMLPolicyCriteria> SkipPolicy(
+    "ml-inliner-skip-policy", cl::Hidden, cl::init(SkipMLPolicyCriteria::Never),
+    cl::values(clEnumValN(SkipMLPolicyCriteria::Never, "never", "never"),
+               clEnumValN(SkipMLPolicyCriteria::IfCallerIsNotCold,
+                          "if-caller-not-cold", "if the caller is not cold")));
+
 #if defined(LLVM_HAVE_TF_AOT_INLINERSIZEMODEL)
 // codegen-ed file
 #include "InlinerSizeModel.h" // NOLINT
@@ -129,7 +139,8 @@ MLInlineAdvisor::MLInlineAdvisor(
           M, MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
       ModelRunner(std::move(Runner)), GetDefaultAdvice(GetDefaultAdvice),
       CG(MAM.getResult<LazyCallGraphAnalysis>(M)),
-      InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize) {
+      InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize),
+      PSI(MAM.getResult<ProfileSummaryAnalysis>(M)) {
   assert(ModelRunner);
   ModelRunner->switchContext("");
   // Extract the 'call site height' feature - the position of a call site
@@ -334,6 +345,11 @@ std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdviceImpl(CallBase &CB) {
   auto &TIR = FAM.getResult<TargetIRAnalysis>(Callee);
   auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller);
 
+  if (SkipPolicy == SkipMLPolicyCriteria::IfCallerIsNotCold) {
+    if (!PSI.isFunctionEntryCold(&Caller))
+      return std::make_unique<InlineAdvice>(this, CB, ORE,
+                                            GetDefaultAdvice(CB));
+  }
   auto MandatoryKind = InlineAdvisor::getMandatoryKind(CB, FAM, ORE);
   // If this is a "never inline" case, there won't be any changes to internal
   // state we need to track, so we can just return the base InlineAdvice, which
diff --git a/llvm/lib/Analysis/models/gen-inline-oz-test-model.py b/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
index 4898509ea544f..83055890283e8 100644
--- a/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
+++ b/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
@@ -102,12 +102,12 @@ def get_output_spec_path(path):
     return os.path.join(path, "output_spec.json")
 
 
-def build_mock_model(path, signature):
+def build_mock_model(path, signature, advice):
     """Build and save the mock model with the given signature"""
     module = tf.Module()
 
     def action(*inputs):
-        return {signature["output"]: tf.constant(value=1, dtype=tf.int64)}
+        return {signature["output"]: tf.constant(value=advice, dtype=tf.int64)}
 
     module.action = tf.function()(action)
     action = {"action": module.action.get_concrete_function(signature["inputs"])}
@@ -128,12 +128,18 @@ def get_signature():
 
 
 def main(argv):
-    assert len(argv) == 2
+    assert len(argv) == 2 or (len(argv) == 3 and argv[2] == "never")
     model_path = argv[1]
 
     print(f"Output model to: [{argv[1]}]")
+
+    constant_advice = 1
+    if len(argv) == 3:
+        constant_advice = 0
+    print(f"The model will always return: {constant_advice}")
+
     signature = get_signature()
-    build_mock_model(model_path, signature)
+    build_mock_model(model_path, signature, constant_advice)
 
 
 if __name__ == "__main__":
diff --git a/llvm/test/Transforms/Inline/ML/bypass.ll b/llvm/test/Transforms/Inline/ML/bypass.ll
new file mode 100644
index 0000000000000..ccdefdcc93bfe
--- /dev/null
+++ b/llvm/test/Transforms/Inline/ML/bypass.ll
@@ -0,0 +1,78 @@
+; REQUIRES: have_tflite
+; RUN: rm -rf %t.runfiles %t.tflite %t.model_out
+; RUN: mkdir %t.runfiles
+; RUN: cp %S/../../../../lib/Analysis/models/gen-inline-oz-test-model.py %t.runfiles
+; RUN: cp %S/../../../../lib/Analysis/models/saved-model-to-tflite.py %t.runfiles
+; RUN: %python %t.runfiles/gen-inline-oz-test-model.py %t.model_out never
+; RUN: %python %t.runfiles/saved-model-to-tflite.py %t.model_out %t.tflite
+
+; When running O2, we expect both callers to inline callee.
+; RUN: opt < %s -passes='default<O2>' -inline-threshold=0 -hot-callsite-threshold=100 -S | FileCheck %s --check-prefixes=O2-HOT,O2-COLD
+
+; The ML model we use always blocks inlining (by construction)
+; RUN: opt < %s -passes='default<O2>' -inline-threshold=0 -hot-callsite-threshold=100 \
+; RUN:  -enable-ml-inliner=development -ml-inliner-model-under-training=%t.tflite \
+; RUN:  -S | FileCheck %s --check-prefixes=ML-HOT,ML-COLD
+
+; When bypassing ML for non-cold callers, the hot caller will have its callee inlined, but the cold one won't
+; RUN: opt < %s -passes='default<O2>' -inline-threshold=0 -hot-callsite-threshold=100 \
+; RUN:  -enable-ml-inliner=development -ml-inliner-model-under-training=%t.tflite \
+; RUN: -ml-inliner-skip-policy=if-caller-not-cold -S | FileCheck %s --check-prefixes=O2-HOT,ML-COLD
+
+declare void @extern()
+
+define i32 @callee(i32 %x) {
+  %x1 = add i32 %x, 1
+  %x2 = add i32 %x1, 1
+  %x3 = add i32 %x2, 1
+  call void @extern()
+  call void @extern()
+  ret i32 %x3
+}
+
+define i32 @hot_caller(i32 %y1) !prof !15 {
+  %y = call i32 @callee(i32 %y1), !prof !16
+  ret i32 %y
+}
+
+define i32 @cold_caller(i32 %y1) !prof !17 {
+  %y = call i32 @callee(i32 %y1), !prof !16
+  ret i32 %y
+}
+
+
+!llvm.module.flags = !{!1}
+!15 = !{!"function_entry_count", i64 300}
+!16 = !{!"branch_weights", i64 300}
+!17 = !{!"function_entry_count", i64 1}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"SampleProfile"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 100, i32 1}
+!13 = !{i32 999000, i64 100, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
+
+; O2-HOT-LABEL: @hot_caller
+; O2-HOT-NOT: call i32 @callee
+; O2-HOT: call void @extern
+; O2-HOT-NEXT: call void @extern
+; O2-HOT-NEXT: ret
+; O2-COLD-LABEL: @cold_caller
+; O2-COLD-NOT: call i32 @callee
+; O2-COLD: call void @extern
+; O2-COLD-NEXT: call void @extern
+; O2-COLD-NEXT: ret
+
+; ML-HOT-LABEL: @hot_caller
+; ML-HOT-NEXT: call i32 @callee
+; ML-COLD-LABEL: @cold_caller
+; ML-COLD-NEXT: call i32 @callee
\ No newline at end of file

From 3a2f7d8a9f84db380af5122418098cb28a57443f Mon Sep 17 00:00:00 2001
From: Fabian Mora <fmora.dev@gmail.com>
Date: Mon, 17 Jun 2024 16:19:21 -0500
Subject: [PATCH 15/25] Revert "Reland [mlir][Target] Improve ROCDL gpu
 serialization API" (#95847)

Reverts llvm/llvm-project#95813
---
 mlir/include/mlir/Target/LLVM/ROCDL/Utils.h |  41 +--
 mlir/lib/Dialect/GPU/CMakeLists.txt         |   2 +-
 mlir/lib/Target/LLVM/CMakeLists.txt         |  13 +-
 mlir/lib/Target/LLVM/ROCDL/Target.cpp       | 306 +++++++++-----------
 4 files changed, 149 insertions(+), 213 deletions(-)

diff --git a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
index 44c9ded317fa5..374fa65bd02e3 100644
--- a/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
+++ b/mlir/include/mlir/Target/LLVM/ROCDL/Utils.h
@@ -27,19 +27,6 @@ namespace ROCDL {
 /// 5. Returns an empty string.
 StringRef getROCMPath();
 
-/// Helper enum for specifying the AMD GCN device libraries required for
-/// compilation.
-enum class AMDGCNLibraries : uint32_t {
-  None = 0,
-  Ockl = 1,
-  Ocml = 2,
-  OpenCL = 4,
-  Hip = 8,
-  LastLib = Hip,
-  LLVM_MARK_AS_BITMASK_ENUM(LastLib),
-  All = (LastLib << 1) - 1
-};
-
 /// Base class for all ROCDL serializations from GPU modules into binary
 /// strings. By default this class serializes into LLVM bitcode.
 class SerializeGPUModuleBase : public LLVM::ModuleToObject {
@@ -62,8 +49,8 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   /// Returns the bitcode files to be loaded.
   ArrayRef<std::string> getFileList() const;
 
-  /// Appends standard ROCm device libraries to `fileList`.
-  LogicalResult appendStandardLibs(AMDGCNLibraries libs);
+  /// Appends standard ROCm device libraries like `ocml.bc`, `ockl.bc`, etc.
+  LogicalResult appendStandardLibs();
 
   /// Loads the bitcode files in `fileList`.
   virtual std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
@@ -76,20 +63,15 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
   LogicalResult handleBitcodeFile(llvm::Module &module) override;
 
 protected:
-  /// Adds `oclc` control variables to the LLVM module.
-  void addControlVariables(llvm::Module &module, AMDGCNLibraries libs,
-                           bool wave64, bool daz, bool finiteOnly,
-                           bool unsafeMath, bool fastMath, bool correctSqrt,
-                           StringRef abiVer);
+  /// Appends the paths of common ROCm device libraries to `libs`.
+  LogicalResult getCommonBitcodeLibs(llvm::SmallVector<std::string> &libs,
+                                     SmallVector<char, 256> &libPath,
+                                     StringRef isaVersion);
 
-  /// Compiles assembly to a binary.
-  virtual std::optional<SmallVector<char, 0>>
-  compileToBinary(const std::string &serializedISA);
-
-  /// Default implementation of `ModuleToObject::moduleToObject`.
-  std::optional<SmallVector<char, 0>>
-  moduleToObjectImpl(const gpu::TargetOptions &targetOptions,
-                     llvm::Module &llvmModule);
+  /// Adds `oclc` control variables to the LLVM module.
+  void addControlVariables(llvm::Module &module, bool wave64, bool daz,
+                           bool finiteOnly, bool unsafeMath, bool fastMath,
+                           bool correctSqrt, StringRef abiVer);
 
   /// Returns the assembled ISA.
   std::optional<SmallVector<char, 0>> assembleIsa(StringRef isa);
@@ -102,9 +84,6 @@ class SerializeGPUModuleBase : public LLVM::ModuleToObject {
 
   /// List of LLVM bitcode files to link to.
   SmallVector<std::string> fileList;
-
-  /// AMD GCN libraries to use when linking, the default is using none.
-  AMDGCNLibraries deviceLibs = AMDGCNLibraries::None;
 };
 } // namespace ROCDL
 } // namespace mlir
diff --git a/mlir/lib/Dialect/GPU/CMakeLists.txt b/mlir/lib/Dialect/GPU/CMakeLists.txt
index 08c8aea36fac9..61ab298ebfb98 100644
--- a/mlir/lib/Dialect/GPU/CMakeLists.txt
+++ b/mlir/lib/Dialect/GPU/CMakeLists.txt
@@ -106,7 +106,7 @@ if(MLIR_ENABLE_ROCM_CONVERSIONS)
       "Building mlir with ROCm support requires the AMDGPU backend")
   endif()
 
-  set(DEFAULT_ROCM_PATH "" CACHE PATH "Fallback path to search for ROCm installs")
+  set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
   target_compile_definitions(obj.MLIRGPUTransforms
     PRIVATE
     __DEFAULT_ROCM_PATH__="${DEFAULT_ROCM_PATH}"
diff --git a/mlir/lib/Target/LLVM/CMakeLists.txt b/mlir/lib/Target/LLVM/CMakeLists.txt
index 6e146710d67af..5a3fa160850b4 100644
--- a/mlir/lib/Target/LLVM/CMakeLists.txt
+++ b/mlir/lib/Target/LLVM/CMakeLists.txt
@@ -123,18 +123,17 @@ add_mlir_dialect_library(MLIRROCDLTarget
   )
 
 if(MLIR_ENABLE_ROCM_CONVERSIONS)
+  if (NOT ("AMDGPU" IN_LIST LLVM_TARGETS_TO_BUILD))
+    message(SEND_ERROR
+      "Building mlir with ROCm support requires the AMDGPU backend")
+  endif()
+
   if (DEFINED ROCM_PATH)
     set(DEFAULT_ROCM_PATH "${ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   elseif(DEFINED ENV{ROCM_PATH})
     set(DEFAULT_ROCM_PATH "$ENV{ROCM_PATH}" CACHE PATH "Fallback path to search for ROCm installs")
   else()
-    IF (WIN32)
-      # Avoid setting an UNIX path for Windows.
-      # TODO: Eventually migrate to FindHIP once it becomes a part of CMake.
-      set(DEFAULT_ROCM_PATH "" CACHE PATH "Fallback path to search for ROCm installs")
-    else()
-      set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
-    endif()
+    set(DEFAULT_ROCM_PATH "/opt/rocm" CACHE PATH "Fallback path to search for ROCm installs")
   endif()
   message(VERBOSE "MLIR Default ROCM toolkit path: ${DEFAULT_ROCM_PATH}")
 
diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
index 6784f3668bde3..cc13e5b7436ea 100644
--- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp
+++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp
@@ -17,6 +17,9 @@
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/Support/FileUtilities.h"
 #include "mlir/Target/LLVM/ROCDL/Utils.h"
+#include "mlir/Target/LLVMIR/Dialect/GPU/GPUToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.h"
+#include "mlir/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.h"
 #include "mlir/Target/LLVMIR/Export.h"
 
 #include "llvm/IR/Constants.h"
@@ -109,9 +112,8 @@ SerializeGPUModuleBase::SerializeGPUModuleBase(
       if (auto file = dyn_cast<StringAttr>(attr))
         fileList.push_back(file.str());
 
-  // By default add all libraries if the toolkit path is not empty.
-  if (!getToolkitPath().empty())
-    deviceLibs = AMDGCNLibraries::All;
+  // Append standard ROCm device bitcode libraries to the files to be loaded.
+  (void)appendStandardLibs();
 }
 
 void SerializeGPUModuleBase::init() {
@@ -136,67 +138,29 @@ ArrayRef<std::string> SerializeGPUModuleBase::getFileList() const {
   return fileList;
 }
 
-LogicalResult SerializeGPUModuleBase::appendStandardLibs(AMDGCNLibraries libs) {
-  if (libs == AMDGCNLibraries::None)
-    return success();
+LogicalResult SerializeGPUModuleBase::appendStandardLibs() {
   StringRef pathRef = getToolkitPath();
-  // Fail if the toolkit is empty.
-  if (pathRef.empty())
-    return failure();
-
-  // Get the path for the device libraries
-  SmallString<256> path;
-  path.insert(path.begin(), pathRef.begin(), pathRef.end());
-  llvm::sys::path::append(path, "amdgcn", "bitcode");
-  pathRef = StringRef(path.data(), path.size());
-
-  // Fail if the path is invalid.
-  if (!llvm::sys::fs::is_directory(pathRef)) {
-    getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
-                                << " does not exist or is not a directory";
-    return failure();
-  }
-
-  // Get the ISA version.
-  StringRef isaVersion =
-      llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
-  isaVersion.consume_front("gfx");
-
-  // Helper function for adding a library.
-  auto addLib = [&](const Twine &lib) -> bool {
-    auto baseSize = path.size();
-    llvm::sys::path::append(path, lib);
-    StringRef pathRef(path.data(), path.size());
-    if (!llvm::sys::fs::is_regular_file(pathRef)) {
-      getOperation().emitRemark() << "bitcode library path: " << pathRef
-                                  << " does not exist or is not a file";
-      return true;
+  if (!pathRef.empty()) {
+    SmallVector<char, 256> path;
+    path.insert(path.begin(), pathRef.begin(), pathRef.end());
+    llvm::sys::path::append(path, "amdgcn", "bitcode");
+    pathRef = StringRef(path.data(), path.size());
+    if (!llvm::sys::fs::is_directory(pathRef)) {
+      getOperation().emitRemark() << "ROCm amdgcn bitcode path: " << pathRef
+                                  << " does not exist or is not a directory.";
+      return failure();
     }
-    fileList.push_back(pathRef.str());
-    path.truncate(baseSize);
-    return false;
-  };
-
-  // Add ROCm device libraries. Fail if any of the libraries is not found, ie.
-  // if any of the `addLib` failed.
-  if ((any(libs & AMDGCNLibraries::Ocml) && addLib("ocml.bc")) ||
-      (any(libs & AMDGCNLibraries::Ockl) && addLib("ockl.bc")) ||
-      (any(libs & AMDGCNLibraries::Hip) && addLib("hip.bc")) ||
-      (any(libs & AMDGCNLibraries::OpenCL) && addLib("opencl.bc")) ||
-      (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl)) &&
-       addLib("oclc_isa_version_" + isaVersion + ".bc")))
-    return failure();
+    StringRef isaVersion =
+        llvm::AMDGPU::getArchNameAMDGCN(llvm::AMDGPU::parseArchAMDGCN(chip));
+    isaVersion.consume_front("gfx");
+    return getCommonBitcodeLibs(fileList, path, isaVersion);
+  }
   return success();
 }
 
 std::optional<SmallVector<std::unique_ptr<llvm::Module>>>
 SerializeGPUModuleBase::loadBitcodeFiles(llvm::Module &module) {
   SmallVector<std::unique_ptr<llvm::Module>> bcFiles;
-  // Return if there are no libs to load.
-  if (deviceLibs == AMDGCNLibraries::None && fileList.empty())
-    return bcFiles;
-  if (failed(appendStandardLibs(deviceLibs)))
-    return std::nullopt;
   if (failed(loadBitcodeFilesFromList(module.getContext(), fileList, bcFiles,
                                       true)))
     return std::nullopt;
@@ -210,76 +174,80 @@ LogicalResult SerializeGPUModuleBase::handleBitcodeFile(llvm::Module &module) {
   // Stop spamming us with clang version numbers
   if (auto *ident = module.getNamedMetadata("llvm.ident"))
     module.eraseNamedMetadata(ident);
-  // Override the libModules datalayout and target triple with the compiler's
-  // data layout should there be a discrepency.
-  setDataLayoutAndTriple(module);
   return success();
 }
 
 void SerializeGPUModuleBase::handleModulePreLink(llvm::Module &module) {
-  // If all libraries are not set, traverse the module to determine which
-  // libraries are required.
-  if (deviceLibs != AMDGCNLibraries::All) {
-    for (llvm::Function &f : module.functions()) {
-      if (f.hasExternalLinkage() && f.hasName() && !f.hasExactDefinition()) {
-        StringRef funcName = f.getName();
-        if ("printf" == funcName)
-          deviceLibs |= AMDGCNLibraries::OpenCL | AMDGCNLibraries::Ockl |
-                        AMDGCNLibraries::Ocml;
-        if (funcName.starts_with("__ockl_"))
-          deviceLibs |= AMDGCNLibraries::Ockl;
-        if (funcName.starts_with("__ocml_"))
-          deviceLibs |= AMDGCNLibraries::Ocml;
-      }
-    }
-  }
-  addControlVariables(module, deviceLibs, target.hasWave64(), target.hasDaz(),
+  [[maybe_unused]] std::optional<llvm::TargetMachine *> targetMachine =
+      getOrCreateTargetMachine();
+  assert(targetMachine && "expect a TargetMachine");
+  addControlVariables(module, target.hasWave64(), target.hasDaz(),
                       target.hasFiniteOnly(), target.hasUnsafeMath(),
                       target.hasFastMath(), target.hasCorrectSqrt(),
                       target.getAbi());
 }
 
-void SerializeGPUModuleBase::addControlVariables(
-    llvm::Module &module, AMDGCNLibraries libs, bool wave64, bool daz,
-    bool finiteOnly, bool unsafeMath, bool fastMath, bool correctSqrt,
-    StringRef abiVer) {
-  // Return if no device libraries are required.
-  if (libs == AMDGCNLibraries::None)
-    return;
-  // Helper function for adding control variables.
-  auto addControlVariable = [&module](StringRef name, uint32_t value,
-                                      uint32_t bitwidth) {
-    if (module.getNamedGlobal(name)) {
-      return;
+// Get the paths of ROCm device libraries.
+LogicalResult SerializeGPUModuleBase::getCommonBitcodeLibs(
+    llvm::SmallVector<std::string> &libs, SmallVector<char, 256> &libPath,
+    StringRef isaVersion) {
+  auto addLib = [&](StringRef path) -> bool {
+    if (!llvm::sys::fs::is_regular_file(path)) {
+      getOperation().emitRemark() << "Bitcode library path: " << path
+                                  << " does not exist or is not a file.\n";
+      return true;
     }
-    llvm::IntegerType *type =
-        llvm::IntegerType::getIntNTy(module.getContext(), bitwidth);
+    libs.push_back(path.str());
+    return false;
+  };
+  auto getLibPath = [&libPath](Twine lib) {
+    auto baseSize = libPath.size();
+    llvm::sys::path::append(libPath, lib + ".bc");
+    std::string path(StringRef(libPath.data(), libPath.size()).str());
+    libPath.truncate(baseSize);
+    return path;
+  };
+
+  // Add ROCm device libraries. Fail if any of the libraries is not found.
+  if (addLib(getLibPath("ocml")) || addLib(getLibPath("ockl")) ||
+      addLib(getLibPath("hip")) || addLib(getLibPath("opencl")) ||
+      addLib(getLibPath("oclc_isa_version_" + isaVersion)))
+    return failure();
+  return success();
+}
+
+void SerializeGPUModuleBase::addControlVariables(
+    llvm::Module &module, bool wave64, bool daz, bool finiteOnly,
+    bool unsafeMath, bool fastMath, bool correctSqrt, StringRef abiVer) {
+  llvm::Type *i8Ty = llvm::Type::getInt8Ty(module.getContext());
+  auto addControlVariable = [i8Ty, &module](StringRef name, bool enable) {
     llvm::GlobalVariable *controlVariable = new llvm::GlobalVariable(
-        module, /*isConstant=*/type, true,
-        llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
-        llvm::ConstantInt::get(type, value), name, /*before=*/nullptr,
-        /*threadLocalMode=*/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
-        /*addressSpace=*/4);
+        module, i8Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
+        llvm::ConstantInt::get(i8Ty, enable), name, nullptr,
+        llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
     controlVariable->setVisibility(
         llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
-    controlVariable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
+    controlVariable->setAlignment(llvm::MaybeAlign(1));
     controlVariable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
   };
-  // Add ocml related control variables.
-  if (any(libs & AMDGCNLibraries::Ocml)) {
-    addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath, 8);
-    addControlVariable("__oclc_daz_opt", daz || fastMath, 8);
-    addControlVariable("__oclc_correctly_rounded_sqrt32",
-                       correctSqrt && !fastMath, 8);
-    addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath, 8);
-  }
-  // Add ocml or ockl related control variables.
-  if (any(libs & (AMDGCNLibraries::Ocml | AMDGCNLibraries::Ockl))) {
-    addControlVariable("__oclc_wavefrontsize64", wave64, 8);
-    int abi = 500;
-    abiVer.getAsInteger(0, abi);
-    addControlVariable("__oclc_ABI_version", abi, 32);
-  }
+  addControlVariable("__oclc_finite_only_opt", finiteOnly || fastMath);
+  addControlVariable("__oclc_unsafe_math_opt", unsafeMath || fastMath);
+  addControlVariable("__oclc_daz_opt", daz || fastMath);
+  addControlVariable("__oclc_correctly_rounded_sqrt32",
+                     correctSqrt && !fastMath);
+  addControlVariable("__oclc_wavefrontsize64", wave64);
+
+  llvm::Type *i32Ty = llvm::Type::getInt32Ty(module.getContext());
+  int abi = 500;
+  abiVer.getAsInteger(0, abi);
+  llvm::GlobalVariable *abiVersion = new llvm::GlobalVariable(
+      module, i32Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
+      llvm::ConstantInt::get(i32Ty, abi), "__oclc_ABI_version", nullptr,
+      llvm::GlobalValue::ThreadLocalMode::NotThreadLocal, 4);
+  abiVersion->setVisibility(
+      llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
+  abiVersion->setAlignment(llvm::MaybeAlign(4));
+  abiVersion->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
 }
 
 std::optional<SmallVector<char, 0>>
@@ -344,16 +312,48 @@ SerializeGPUModuleBase::assembleIsa(StringRef isa) {
 
   parser->setTargetParser(*tap);
   parser->Run(false);
+
   return result;
 }
 
+#if MLIR_ENABLE_ROCM_CONVERSIONS
+namespace {
+class AMDGPUSerializer : public SerializeGPUModuleBase {
+public:
+  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                   const gpu::TargetOptions &targetOptions);
+
+  gpu::GPUModuleOp getOperation();
+
+  // Compile to HSA.
+  std::optional<SmallVector<char, 0>>
+  compileToBinary(const std::string &serializedISA);
+
+  std::optional<SmallVector<char, 0>>
+  moduleToObject(llvm::Module &llvmModule) override;
+
+private:
+  // Target options.
+  gpu::TargetOptions targetOptions;
+};
+} // namespace
+
+AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
+                                   const gpu::TargetOptions &targetOptions)
+    : SerializeGPUModuleBase(module, target, targetOptions),
+      targetOptions(targetOptions) {}
+
+gpu::GPUModuleOp AMDGPUSerializer::getOperation() {
+  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
+}
+
 std::optional<SmallVector<char, 0>>
-SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
+AMDGPUSerializer::compileToBinary(const std::string &serializedISA) {
   // Assemble the ISA.
   std::optional<SmallVector<char, 0>> isaBinary = assembleIsa(serializedISA);
 
   if (!isaBinary) {
-    getOperation().emitError() << "failed during ISA assembling";
+    getOperation().emitError() << "Failed during ISA assembling.";
     return std::nullopt;
   }
 
@@ -363,7 +363,7 @@ SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel%%", "o", tempIsaBinaryFd,
                                          tempIsaBinaryFilename)) {
     getOperation().emitError()
-        << "failed to create a temporary file for dumping the ISA binary";
+        << "Failed to create a temporary file for dumping the ISA binary.";
     return std::nullopt;
   }
   llvm::FileRemover cleanupIsaBinary(tempIsaBinaryFilename);
@@ -378,7 +378,7 @@ SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   if (llvm::sys::fs::createTemporaryFile("kernel", "hsaco",
                                          tempHsacoFilename)) {
     getOperation().emitError()
-        << "failed to create a temporary file for the HSA code object";
+        << "Failed to create a temporary file for the HSA code object.";
     return std::nullopt;
   }
   llvm::FileRemover cleanupHsaco(tempHsacoFilename);
@@ -389,7 +389,7 @@ SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
       lldPath,
       {"ld.lld", "-shared", tempIsaBinaryFilename, "-o", tempHsacoFilename});
   if (lldResult != 0) {
-    getOperation().emitError() << "lld invocation failed";
+    getOperation().emitError() << "lld invocation failed.";
     return std::nullopt;
   }
 
@@ -398,7 +398,7 @@ SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
       llvm::MemoryBuffer::getFile(tempHsacoFilename, /*IsText=*/false);
   if (!hsacoFile) {
     getOperation().emitError()
-        << "failed to read the HSA code object from the temp file";
+        << "Failed to read the HSA code object from the temp file.";
     return std::nullopt;
   }
 
@@ -407,13 +407,13 @@ SerializeGPUModuleBase::compileToBinary(const std::string &serializedISA) {
   return SmallVector<char, 0>(buffer.begin(), buffer.end());
 }
 
-std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
-    const gpu::TargetOptions &targetOptions, llvm::Module &llvmModule) {
+std::optional<SmallVector<char, 0>>
+AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
   // Return LLVM IR if the compilation target is offload.
 #define DEBUG_TYPE "serialize-to-llvm"
   LLVM_DEBUG({
-    llvm::dbgs() << "LLVM IR for module: "
-                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
+    llvm::dbgs() << "LLVM IR for module: " << getOperation().getNameAttr()
+                 << "\n"
                  << llvmModule << "\n";
   });
 #undef DEBUG_TYPE
@@ -423,8 +423,8 @@ std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
   std::optional<llvm::TargetMachine *> targetMachine =
       getOrCreateTargetMachine();
   if (!targetMachine) {
-    getOperation().emitError() << "target Machine unavailable for triple "
-                               << triple << ", can't compile with LLVM";
+    getOperation().emitError() << "Target Machine unavailable for triple "
+                               << triple << ", can't compile with LLVM\n";
     return std::nullopt;
   }
 
@@ -432,13 +432,12 @@ std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
   std::optional<std::string> serializedISA =
       translateToISA(llvmModule, **targetMachine);
   if (!serializedISA) {
-    getOperation().emitError() << "failed translating the module to ISA";
+    getOperation().emitError() << "Failed translating the module to ISA.";
     return std::nullopt;
   }
 #define DEBUG_TYPE "serialize-to-isa"
   LLVM_DEBUG({
-    llvm::dbgs() << "ISA for module: "
-                 << cast<gpu::GPUModuleOp>(getOperation()).getNameAttr() << "\n"
+    llvm::dbgs() << "ISA for module: " << getOperation().getNameAttr() << "\n"
                  << *serializedISA << "\n";
   });
 #undef DEBUG_TYPE
@@ -446,45 +445,9 @@ std::optional<SmallVector<char, 0>> SerializeGPUModuleBase::moduleToObjectImpl(
   if (targetOptions.getCompilationTarget() == gpu::CompilationTarget::Assembly)
     return SmallVector<char, 0>(serializedISA->begin(), serializedISA->end());
 
-  // Compiling to binary requires a valid ROCm path, fail if it's not found.
-  if (getToolkitPath().empty())
-    getOperation().emitError() << "invalid ROCm path, please set a valid path";
-
   // Compile to binary.
   return compileToBinary(*serializedISA);
 }
-
-#if MLIR_ENABLE_ROCM_CONVERSIONS
-namespace {
-class AMDGPUSerializer : public SerializeGPUModuleBase {
-public:
-  AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                   const gpu::TargetOptions &targetOptions);
-
-  gpu::GPUModuleOp getOperation();
-
-  std::optional<SmallVector<char, 0>>
-  moduleToObject(llvm::Module &llvmModule) override;
-
-private:
-  // Target options.
-  gpu::TargetOptions targetOptions;
-};
-} // namespace
-
-AMDGPUSerializer::AMDGPUSerializer(Operation &module, ROCDLTargetAttr target,
-                                   const gpu::TargetOptions &targetOptions)
-    : SerializeGPUModuleBase(module, target, targetOptions),
-      targetOptions(targetOptions) {}
-
-gpu::GPUModuleOp AMDGPUSerializer::getOperation() {
-  return dyn_cast<gpu::GPUModuleOp>(&SerializeGPUModuleBase::getOperation());
-}
-
-std::optional<SmallVector<char, 0>>
-AMDGPUSerializer::moduleToObject(llvm::Module &llvmModule) {
-  return moduleToObjectImpl(targetOptions, llvmModule);
-}
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 
 std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
@@ -494,7 +457,7 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   if (!module)
     return std::nullopt;
   if (!mlir::isa<gpu::GPUModuleOp>(module)) {
-    module->emitError("module must be a GPU module");
+    module->emitError("Module must be a GPU module.");
     return std::nullopt;
   }
 #if MLIR_ENABLE_ROCM_CONVERSIONS
@@ -503,8 +466,8 @@ std::optional<SmallVector<char, 0>> ROCDLTargetAttrImpl::serializeToObject(
   serializer.init();
   return serializer.run();
 #else
-  module->emitError("the `AMDGPU` target was not built. Please enable it when "
-                    "building LLVM");
+  module->emitError("The `AMDGPU` target was not built. Please enable it when "
+                    "building LLVM.");
   return std::nullopt;
 #endif // MLIR_ENABLE_ROCM_CONVERSIONS
 }
@@ -514,15 +477,10 @@ ROCDLTargetAttrImpl::createObject(Attribute attribute,
                                   const SmallVector<char, 0> &object,
                                   const gpu::TargetOptions &options) const {
   gpu::CompilationTarget format = options.getCompilationTarget();
-  // If format is `fatbin` transform it to binary as `fatbin` is not yet
-  // supported.
-  if (format > gpu::CompilationTarget::Binary)
-    format = gpu::CompilationTarget::Binary;
-
-  DictionaryAttr properties{};
   Builder builder(attribute.getContext());
   return builder.getAttr<gpu::ObjectAttr>(
-      attribute, format,
-      builder.getStringAttr(StringRef(object.data(), object.size())),
-      properties);
+      attribute,
+      format > gpu::CompilationTarget::Binary ? gpu::CompilationTarget::Binary
+                                              : format,
+      builder.getStringAttr(StringRef(object.data(), object.size())), nullptr);
 }

From 1adf0fae0503fccd1216865dd0276c3ec97f62a7 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 17 Jun 2024 14:38:26 -0700
Subject: [PATCH 16/25] Revert "[HWASan] [compiler-rt] support non-4k pages on
 Android" (#95853)

Reverts llvm/llvm-project#95069

Broke windows bot
---
 compiler-rt/lib/asan/asan_linux.cpp             |  3 +--
 compiler-rt/lib/asan/asan_mac.cpp               |  3 +--
 compiler-rt/lib/asan/asan_premap_shadow.cpp     |  3 +--
 .../lib/hwasan/hwasan_dynamic_shadow.cpp        | 17 ++++++-----------
 compiler-rt/lib/memprof/memprof_linux.cpp       |  3 +--
 .../lib/sanitizer_common/sanitizer_common.h     | 17 +++++++++--------
 .../lib/sanitizer_common/sanitizer_linux.cpp    |  4 ++--
 .../sanitizer_linux_libcdep.cpp                 |  5 +++--
 .../lib/sanitizer_common/sanitizer_mac.cpp      |  4 ++--
 .../lib/sanitizer_common/sanitizer_win.cpp      |  5 +++--
 10 files changed, 29 insertions(+), 35 deletions(-)

diff --git a/compiler-rt/lib/asan/asan_linux.cpp b/compiler-rt/lib/asan/asan_linux.cpp
index 0b470db86748f..a517de5af00dc 100644
--- a/compiler-rt/lib/asan/asan_linux.cpp
+++ b/compiler-rt/lib/asan/asan_linux.cpp
@@ -93,8 +93,7 @@ uptr FindDynamicShadowStart() {
 #  endif
 
   return MapDynamicShadow(shadow_size_bytes, ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
-                          GetMmapGranularity());
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
 }
 
 void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
diff --git a/compiler-rt/lib/asan/asan_mac.cpp b/compiler-rt/lib/asan/asan_mac.cpp
index bfc349223258b..b250f796e165f 100644
--- a/compiler-rt/lib/asan/asan_mac.cpp
+++ b/compiler-rt/lib/asan/asan_mac.cpp
@@ -51,8 +51,7 @@ bool IsSystemHeapAddress (uptr addr) { return false; }
 
 uptr FindDynamicShadowStart() {
   return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
-                          GetMmapGranularity());
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
 }
 
 // No-op. Mac does not support static linkage anyway.
diff --git a/compiler-rt/lib/asan/asan_premap_shadow.cpp b/compiler-rt/lib/asan/asan_premap_shadow.cpp
index 6e08b8f966507..bed2f62a22511 100644
--- a/compiler-rt/lib/asan/asan_premap_shadow.cpp
+++ b/compiler-rt/lib/asan/asan_premap_shadow.cpp
@@ -33,8 +33,7 @@ uptr PremapShadowSize() {
 // PremapShadowSize() bytes on the right of it are mapped r/o.
 uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), /*mmap_alignment_scale*/ 3,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
-                          GetMmapGranularity());
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
 }
 
 bool PremapShadowFailed() {
diff --git a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
index 48bc3b631ac07..7642ba6c0bf08 100644
--- a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
@@ -36,20 +36,15 @@ decltype(__hwasan_shadow)* __hwasan_premap_shadow();
 
 namespace __hwasan {
 
-// We cannot call anything in libc here (see comment above), so we need to
-// assume the biggest allowed page size.
-// Android max page size is defined as 16k here:
-// https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/page.h#41
-static constexpr uptr kMaxGranularity = 16384;
-
 // Conservative upper limit.
 static uptr PremapShadowSize() {
-  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale, kMaxGranularity);
+  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale,
+                   GetMmapGranularity());
 }
 
 static uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), kShadowScale,
-                          kShadowBaseAlignment, kHighMemEnd, kMaxGranularity);
+                          kShadowBaseAlignment, kHighMemEnd);
 }
 
 static bool IsPremapShadowAvailable() {
@@ -61,7 +56,7 @@ static bool IsPremapShadowAvailable() {
 }
 
 static uptr FindPremappedShadowStart(uptr shadow_size_bytes) {
-  const uptr granularity = kMaxGranularity;
+  const uptr granularity = GetMmapGranularity();
   const uptr shadow_start = reinterpret_cast<uptr>(&__hwasan_shadow);
   const uptr premap_shadow_size = PremapShadowSize();
   const uptr shadow_size = RoundUpTo(shadow_size_bytes, granularity);
@@ -114,7 +109,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
   if (IsPremapShadowAvailable())
     return FindPremappedShadowStart(shadow_size_bytes);
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd, kMaxGranularity);
+                          kHighMemEnd);
 }
 
 }  // namespace __hwasan
@@ -140,7 +135,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
                                     RingBufferSize());
 #  endif
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd, GetMmapGranularity());
+                          kHighMemEnd);
 }
 
 }  // namespace __hwasan
diff --git a/compiler-rt/lib/memprof/memprof_linux.cpp b/compiler-rt/lib/memprof/memprof_linux.cpp
index fbe5d250f840b..26a2b456b874e 100644
--- a/compiler-rt/lib/memprof/memprof_linux.cpp
+++ b/compiler-rt/lib/memprof/memprof_linux.cpp
@@ -58,8 +58,7 @@ void InitializePlatformExceptionHandlers() {}
 uptr FindDynamicShadowStart() {
   uptr shadow_size_bytes = MemToShadowSize(kHighMemEnd);
   return MapDynamicShadow(shadow_size_bytes, SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
-                          GetMmapGranularity());
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
 }
 
 void *MemprofDlSymNext(const char *sym) { return dlsym(RTLD_NEXT, sym); }
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index 2d1059140c303..c451fc962c529 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -60,10 +60,14 @@ inline int Verbosity() {
   return atomic_load(&current_verbosity, memory_order_relaxed);
 }
 
-#if SANITIZER_ANDROID && !defined(__aarch64__)
-// 32-bit Android only has 4k pages.
-inline uptr GetPageSize() { return 4096; }
-inline uptr GetPageSizeCached() { return 4096; }
+#if SANITIZER_ANDROID
+inline uptr GetPageSize() {
+// Android post-M sysconf(_SC_PAGESIZE) crashes if called from .preinit_array.
+  return 4096;
+}
+inline uptr GetPageSizeCached() {
+  return 4096;
+}
 #else
 uptr GetPageSize();
 extern uptr PageSizeCached;
@@ -73,7 +77,6 @@ inline uptr GetPageSizeCached() {
   return PageSizeCached;
 }
 #endif
-
 uptr GetMmapGranularity();
 uptr GetMaxVirtualAddress();
 uptr GetMaxUserVirtualAddress();
@@ -88,7 +91,6 @@ void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
 
 // Memory management
 void *MmapOrDie(uptr size, const char *mem_type, bool raw_report = false);
-
 inline void *MmapOrDieQuietly(uptr size, const char *mem_type) {
   return MmapOrDie(size, mem_type, /*raw_report*/ true);
 }
@@ -137,8 +139,7 @@ void UnmapFromTo(uptr from, uptr to);
 // shadow_size_bytes bytes on the right, which on linux is mapped no access.
 // The high_mem_end may be updated if the original shadow size doesn't fit.
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end,
-                      uptr granularity);
+                      uptr min_shadow_base_alignment, uptr &high_mem_end);
 
 // Let S = max(shadow_size, num_aliases * alias_size, ring_buffer_size).
 // Reserves 2*S bytes of address space to the right of the returned address and
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index d15caa76efb06..5d2dd3a7a658f 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -1136,7 +1136,7 @@ uptr GetMaxUserVirtualAddress() {
   return addr;
 }
 
-#  if !SANITIZER_ANDROID || defined(__aarch64__)
+#  if !SANITIZER_ANDROID
 uptr GetPageSize() {
 #    if SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__)) && \
         defined(EXEC_PAGESIZE)
@@ -1155,7 +1155,7 @@ uptr GetPageSize() {
   return sysconf(_SC_PAGESIZE);  // EXEC_PAGESIZE may not be trustworthy.
 #    endif
 }
-#  endif
+#  endif  // !SANITIZER_ANDROID
 
 uptr ReadBinaryName(/*out*/ char *buf, uptr buf_len) {
 #  if SANITIZER_SOLARIS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index 175362183fd78..6d05411222d9e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -995,8 +995,9 @@ void UnmapFromTo(uptr from, uptr to) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
-                      uptr granularity) {
+                      uptr min_shadow_base_alignment,
+                      UNUSED uptr &high_mem_end) {
+  const uptr granularity = GetMmapGranularity();
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index cbdf3e95925bf..24e3d1112520e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -1188,8 +1188,8 @@ uptr GetMaxVirtualAddress() {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end,
-                      uptr granularity) {
+                      uptr min_shadow_base_alignment, uptr &high_mem_end) {
+  const uptr granularity = GetMmapGranularity();
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
index 0b198890fc798..4e5ad8e4693b4 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
@@ -384,8 +384,9 @@ bool DontDumpShadowMemory(uptr addr, uptr length) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
-                      uptr granularity) {
+                      uptr min_shadow_base_alignment,
+                      UNUSED uptr &high_mem_end) {
+  const uptr granularity = GetMmapGranularity();
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =

From 6b54ecca885f58cd9449535a8678d93144c62e4d Mon Sep 17 00:00:00 2001
From: Mircea Trofin <mtrofin@google.com>
Date: Mon, 17 Jun 2024 14:37:24 -0700
Subject: [PATCH 17/25] [mlgo] remove `inlining_default` - unused feature

The feature was only exposed for training and was immediately dropped on
the training side. It was bulk-copied into the test model generator, where
it had no effect (the generator always returns a constant).

In the AOT + test model case, since the test model returns a constant, all
input features are pruned by the AOT compiler, so its presence/absence
doesn't matter.
---
 llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp   | 1 -
 llvm/lib/Analysis/models/gen-inline-oz-test-model.py | 1 -
 2 files changed, 2 deletions(-)

diff --git a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
index 7d51302bcc1ad..3ab0af8d57da1 100644
--- a/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
+++ b/llvm/lib/Analysis/DevelopmentModeInlineAdvisor.cpp
@@ -253,7 +253,6 @@ class LoggingMLInlineAdvice : public MLInlineAdvice {
 };
 
 static const std::vector<TensorSpec> TrainingOnlyFeatures{
-    TensorSpec::createSpec<int64_t>(TFFeedPrefix + "inlining_default", {1}),
     TensorSpec::createSpec<float>(TFFeedPrefix + "discount", {1}),
     TensorSpec::createSpec<float>(TFFeedPrefix + "reward", {1}),
     TensorSpec::createSpec<int32_t>(TFFeedPrefix + "step_type", {1})};
diff --git a/llvm/lib/Analysis/models/gen-inline-oz-test-model.py b/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
index 83055890283e8..3846e7a3cee76 100644
--- a/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
+++ b/llvm/lib/Analysis/models/gen-inline-oz-test-model.py
@@ -47,7 +47,6 @@ def get_input_signature():
             "edge_count",
             "callsite_height",
             "cost_estimate",
-            "inlining_default",
             "sroa_savings",
             "sroa_losses",
             "load_elimination",

From ca22469a101e73fd2b2057d98d3f880f064764e5 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 17 Jun 2024 17:56:45 -0400
Subject: [PATCH 18/25] [libc][stdlib] Run freelist_heap_test only in full
 build mode. (#95850)

---
 libc/test/src/stdlib/CMakeLists.txt | 30 +++++++++++++++--------------
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt
index 648404afb5730..0ded674ee0e12 100644
--- a/libc/test/src/stdlib/CMakeLists.txt
+++ b/libc/test/src/stdlib/CMakeLists.txt
@@ -79,20 +79,22 @@ add_libc_test(
     libc.src.__support.CPP.span
 )
 
-add_libc_test(
-  freelist_heap_test
-  SUITE
-    libc-stdlib-tests
-  SRCS
-    freelist_heap_test.cpp
-    freelist_malloc_test.cpp
-  DEPENDS
-    libc.src.__support.CPP.span
-    libc.src.stdlib.freelist_heap
-    libc.src.stdlib.malloc
-    libc.src.string.memcmp
-    libc.src.string.memcpy
-)
+if(LLVM_LIBC_FULL_BUILD)
+  add_libc_test(
+    freelist_heap_test
+    SUITE
+      libc-stdlib-tests
+    SRCS
+      freelist_heap_test.cpp
+      freelist_malloc_test.cpp
+    DEPENDS
+      libc.src.__support.CPP.span
+      libc.src.stdlib.freelist_heap
+      libc.src.stdlib.malloc
+      libc.src.string.memcmp
+      libc.src.string.memcpy
+  )
+endif()
 
 add_fp_unittest(
   strtod_test

From ca1a96364a568be6a2e65e6f12328b23dd35bafc Mon Sep 17 00:00:00 2001
From: Jacques Pienaar <jpienaar@google.com>
Date: Mon, 17 Jun 2024 15:06:15 -0700
Subject: [PATCH 19/25] [mlir][drr] Fix variadic destination emission (#95855)

It's possible for handleResultPattern to emit helpers; these helpers
cannot be interleaved with pushing into the array. Emit into a separate
string to enable helpers to be emitted before the population of the vector.

Signed-off-by: Jacques Pienaar <jpienaar@google.com>
---
 mlir/test/mlir-tblgen/rewriter-static-matcher.td | 15 +++++++++++++++
 mlir/tools/mlir-tblgen/RewriterGen.cpp           | 15 +++++++++------
 2 files changed, 24 insertions(+), 6 deletions(-)

diff --git a/mlir/test/mlir-tblgen/rewriter-static-matcher.td b/mlir/test/mlir-tblgen/rewriter-static-matcher.td
index 2907923cb28e4..7a84dfd5706b1 100644
--- a/mlir/test/mlir-tblgen/rewriter-static-matcher.td
+++ b/mlir/test/mlir-tblgen/rewriter-static-matcher.td
@@ -35,6 +35,16 @@ def COp : NS_Op<"c_op", []> {
   let results = (outs AnyInteger);
 }
 
+def DOp : NS_Op<"d_op", []> {
+  let arguments = (ins
+    Variadic<AnyInteger>:$any_integer
+  );
+
+  let results = (outs AnyInteger);
+}
+
+def Foo : NativeCodeCall<"foo($_builder, $0)">;
+
 // Test static matcher for duplicate DagNode
 // ---
 
@@ -53,3 +63,8 @@ def : Pat<(AOp (BOp I32Attr:$attr, I32:$int)),
 // CHECK: if(::mlir::failed([[$DAG_MATCHER]](rewriter, op1, tblgen_ops
 def : Pat<(COp $_, (BOp I32Attr:$attr, I32:$int)),
           (COp $attr, $int)>;
+
+// CHECK: auto [[$VAR:.*]] = foo(
+// CHECK: ::llvm::SmallVector<::mlir::Value, 4> [[$ARR:tblgen_variadic_values_.*]];
+// CHECK: [[$ARR]].push_back([[$VAR]]);
+def : Pat<(AOp $x), (DOp (variadic (Foo $x)))>;
diff --git a/mlir/tools/mlir-tblgen/RewriterGen.cpp b/mlir/tools/mlir-tblgen/RewriterGen.cpp
index d8e16d98fd756..b8a3d5b49938e 100644
--- a/mlir/tools/mlir-tblgen/RewriterGen.cpp
+++ b/mlir/tools/mlir-tblgen/RewriterGen.cpp
@@ -1261,20 +1261,23 @@ std::string PatternEmitter::handleResultPattern(DagNode resultTree,
 std::string PatternEmitter::handleVariadic(DagNode tree, int depth) {
   assert(tree.isVariadic());
 
+  std::string output;
+  llvm::raw_string_ostream oss(output);
   auto name = std::string(formatv("tblgen_variadic_values_{0}", nextValueId++));
   symbolInfoMap.bindValue(name);
-  os << "::llvm::SmallVector<::mlir::Value, 4> " << name << ";\n";
+  oss << "::llvm::SmallVector<::mlir::Value, 4> " << name << ";\n";
   for (int i = 0, e = tree.getNumArgs(); i != e; ++i) {
     if (auto child = tree.getArgAsNestedDag(i)) {
-      os << name << ".push_back(" << handleResultPattern(child, i, depth + 1)
-         << ");\n";
+      oss << name << ".push_back(" << handleResultPattern(child, i, depth + 1)
+          << ");\n";
     } else {
-      os << name << ".push_back("
-         << handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i))
-         << ");\n";
+      oss << name << ".push_back("
+          << handleOpArgument(tree.getArgAsLeaf(i), tree.getArgName(i))
+          << ");\n";
     }
   }
 
+  os << oss.str();
   return name;
 }
 

From 6c17f1cabdee3399feceb478921a8369bde18b16 Mon Sep 17 00:00:00 2001
From: Adrian Prantl <aprantl@apple.com>
Date: Mon, 17 Jun 2024 10:54:46 -0700
Subject: [PATCH 20/25] Fix comments in ValueObjectPrinter.h (NFC)

---
 .../lldb/DataFormatters/ValueObjectPrinter.h  | 21 +++++++++----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
index 32b101a2f9843..fb5d60ba30d77 100644
--- a/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
+++ b/lldb/include/lldb/DataFormatters/ValueObjectPrinter.h
@@ -1,5 +1,4 @@
-//===-- ValueObjectPrinter.h ---------------------------------------*- C++
-//-*-===//
+//===-- ValueObjectPrinter.h ------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -25,7 +24,7 @@ class ValueObjectPrinter {
   /// does not retain the ValueObject it is printing, that is the job of
   /// its caller.  It also doesn't attempt to track changes in the
   /// ValueObject, e.g. changing synthetic child providers or changing
-  /// dynamic vrs. static vrs. synthetic settings.
+  /// dynamic versus static versus synthetic settings.
 public:
   ValueObjectPrinter(ValueObject &valobj, Stream *s);
 
@@ -42,16 +41,16 @@ class ValueObjectPrinter {
 
   InstancePointersSetSP m_printed_instance_pointers;
 
-  // only this class (and subclasses, if any) should ever be concerned with the
-  // depth mechanism
+  /// Only this class (and subclasses, if any) should ever be
+  /// concerned with the depth mechanism.
   ValueObjectPrinter(ValueObject &valobj, Stream *s,
                      const DumpValueObjectOptions &options,
                      const DumpValueObjectOptions::PointerDepth &ptr_depth,
                      uint32_t curr_depth,
                      InstancePointersSetSP printed_instance_pointers);
 
-  // we should actually be using delegating constructors here but some versions
-  // of GCC still have trouble with those
+  /// We should actually be using delegating constructors here but
+  /// some versions of GCC still have trouble with those.
   void Init(ValueObject &valobj, Stream *s,
             const DumpValueObjectOptions &options,
             const DumpValueObjectOptions::PointerDepth &ptr_depth,
@@ -67,7 +66,7 @@ class ValueObjectPrinter {
   /// use dynamic and use synthetic settings of the ValueObject being printed,
   /// so changes made to these settings won't affect already made
   /// ValueObjectPrinters. SetupMostSpecializedValue();
-
+  ///
   /// Access the cached "most specialized value" - that is the one to use for
   /// printing the value object's value.  However, be sure to use
   /// GetValueForChildGeneration when you are generating the children of this
@@ -143,9 +142,9 @@ class ValueObjectPrinter {
   bool ShouldShowName() const;
 
   ValueObject &m_orig_valobj;
-  ValueObject *m_cached_valobj; /// Cache the current "most specialized" value.
-                                /// Don't use this directly, use
-                                /// GetMostSpecializedValue.
+  /// Cache the current "most specialized" value.  Don't use this
+  /// directly, use GetMostSpecializedValue.
+  ValueObject *m_cached_valobj;
   Stream *m_stream;
   DumpValueObjectOptions m_options;
   Flags m_type_flags;

From 68fc8dffe4669fed1f7eb88e6770509505f9960a Mon Sep 17 00:00:00 2001
From: shaw young <58664393+shawbyoung@users.noreply.github.com>
Date: Mon, 17 Jun 2024 13:27:56 -0700
Subject: [PATCH 21/25] [BOLT] Drop high discrepancy profiles in matching
 (#95156)

Summary: Functions with high discrepancy
(measured by matched function blocks)
can be ignored with an added command line
argument for better performance.

Test Plan: Added
stale-matching-min-matched-block.test

---------

Co-authored-by: Amir Ayupov <aaupov@fb.com>
---
 bolt/docs/CommandLineArgumentReference.md     |  7 ++-
 bolt/lib/Profile/StaleProfileMatching.cpp     | 30 +++++++---
 ...arge_profile_stale_low_matched_blocks.yaml | 57 +++++++++++++++++++
 .../X86/stale-matching-min-matched-block.test | 10 ++++
 4 files changed, 96 insertions(+), 8 deletions(-)
 create mode 100644 bolt/test/X86/Inputs/blarge_profile_stale_low_matched_blocks.yaml
 create mode 100644 bolt/test/X86/stale-matching-min-matched-block.test

diff --git a/bolt/docs/CommandLineArgumentReference.md b/bolt/docs/CommandLineArgumentReference.md
index 8887d1f5d5bd4..49e226513028f 100644
--- a/bolt/docs/CommandLineArgumentReference.md
+++ b/bolt/docs/CommandLineArgumentReference.md
@@ -802,6 +802,11 @@
 
   The maximum size of a function to consider for inference.
 
+- `--stale-matching-min-matched-block=<uint>`
+
+  Minimum percentage of exactly matched blocks for a function to be considered
+  for profile inference.
+
 - `--stale-threshold=<uint>`
 
   Maximum percentage of stale functions to tolerate (default: 100)
@@ -1161,4 +1166,4 @@
 
 - `--print-options`
 
-  Print non-default options after command line parsing
\ No newline at end of file
+  Print non-default options after command line parsing
diff --git a/bolt/lib/Profile/StaleProfileMatching.cpp b/bolt/lib/Profile/StaleProfileMatching.cpp
index 365bc5389266d..c9bcd04281e72 100644
--- a/bolt/lib/Profile/StaleProfileMatching.cpp
+++ b/bolt/lib/Profile/StaleProfileMatching.cpp
@@ -51,6 +51,12 @@ cl::opt<bool>
                       cl::desc("Infer counts from stale profile data."),
                       cl::init(false), cl::Hidden, cl::cat(BoltOptCategory));
 
+cl::opt<unsigned> StaleMatchingMinMatchedBlock(
+    "stale-matching-min-matched-block",
+    cl::desc("Percentage threshold of matched basic blocks at which stale "
+             "profile inference is executed."),
+    cl::init(0), cl::Hidden, cl::cat(BoltOptCategory));
+
 cl::opt<unsigned> StaleMatchingMaxFuncSize(
     "stale-matching-max-func-size",
     cl::desc("The maximum size of a function to consider for inference."),
@@ -391,10 +397,9 @@ createFlowFunction(const BinaryFunction::BasicBlockOrderType &BlockOrder) {
 /// of the basic blocks in the binary, the count is "matched" to the block.
 /// Similarly, if both the source and the target of a count in the profile are
 /// matched to a jump in the binary, the count is recorded in CFG.
-void matchWeightsByHashes(BinaryContext &BC,
-                          const BinaryFunction::BasicBlockOrderType &BlockOrder,
-                          const yaml::bolt::BinaryFunctionProfile &YamlBF,
-                          FlowFunction &Func) {
+size_t matchWeightsByHashes(
+    BinaryContext &BC, const BinaryFunction::BasicBlockOrderType &BlockOrder,
+    const yaml::bolt::BinaryFunctionProfile &YamlBF, FlowFunction &Func) {
   assert(Func.Blocks.size() == BlockOrder.size() + 1);
 
   std::vector<FlowBlock *> Blocks;
@@ -500,6 +505,8 @@ void matchWeightsByHashes(BinaryContext &BC,
     Block.HasUnknownWeight = false;
     Block.Weight = std::max(OutWeight[Block.Index], InWeight[Block.Index]);
   }
+
+  return MatchedBlocks.size();
 }
 
 /// The function finds all blocks that are (i) reachable from the Entry block
@@ -575,10 +582,16 @@ void preprocessUnreachableBlocks(FlowFunction &Func) {
 /// Decide if stale profile matching can be applied for a given function.
 /// Currently we skip inference for (very) large instances and for instances
 /// having "unexpected" control flow (e.g., having no sink basic blocks).
-bool canApplyInference(const FlowFunction &Func) {
+bool canApplyInference(const FlowFunction &Func,
+                       const yaml::bolt::BinaryFunctionProfile &YamlBF,
+                       const uint64_t &MatchedBlocks) {
   if (Func.Blocks.size() > opts::StaleMatchingMaxFuncSize)
     return false;
 
+  if (MatchedBlocks * 100 <
+      opts::StaleMatchingMinMatchedBlock * YamlBF.Blocks.size())
+    return false;
+
   bool HasExitBlocks = llvm::any_of(
       Func.Blocks, [&](const FlowBlock &Block) { return Block.isExit(); });
   if (!HasExitBlocks)
@@ -725,18 +738,21 @@ bool YAMLProfileReader::inferStaleProfile(
   const BinaryFunction::BasicBlockOrderType BlockOrder(
       BF.getLayout().block_begin(), BF.getLayout().block_end());
 
+  // Tracks the number of matched blocks.
+
   // Create a wrapper flow function to use with the profile inference algorithm.
   FlowFunction Func = createFlowFunction(BlockOrder);
 
   // Match as many block/jump counts from the stale profile as possible
-  matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);
+  size_t MatchedBlocks =
+      matchWeightsByHashes(BF.getBinaryContext(), BlockOrder, YamlBF, Func);
 
   // Adjust the flow function by marking unreachable blocks Unlikely so that
   // they don't get any counts assigned.
   preprocessUnreachableBlocks(Func);
 
   // Check if profile inference can be applied for the instance.
-  if (!canApplyInference(Func))
+  if (!canApplyInference(Func, YamlBF, MatchedBlocks))
     return false;
 
   // Apply the profile inference algorithm.
diff --git a/bolt/test/X86/Inputs/blarge_profile_stale_low_matched_blocks.yaml b/bolt/test/X86/Inputs/blarge_profile_stale_low_matched_blocks.yaml
new file mode 100644
index 0000000000000..785e23922ce49
--- /dev/null
+++ b/bolt/test/X86/Inputs/blarge_profile_stale_low_matched_blocks.yaml
@@ -0,0 +1,57 @@
+---
+header:
+  profile-version: 1
+  binary-name:     'reader-yaml.test.tmp.exe'
+  binary-build-id: '<unknown>'
+  profile-flags:   [ lbr ]
+  profile-origin:  branch profile reader
+  profile-events:  ''
+  dfs-order:       false
+  hash-func:       xxh3
+functions:
+  - name:            SolveCubic
+    fid:             6
+    hash:            0x0000000000000000
+    exec:            151
+    nblocks:         18
+    blocks:
+      - bid:             0
+        insns:           43
+        hash:            0x4600940a609c0000
+        exec:            151
+        succ:            [ { bid: 1, cnt: 151, mis: 2 }, { bid: 7, cnt: 0 } ]
+      - bid:             1
+        insns:           7
+        hash:            0x167a1f084f130088
+        succ:            [ { bid: 13, cnt: 151 }, { bid: 2, cnt: 0 } ]
+      - bid:             13
+        insns:           26
+        hash:            0xa8d50000f81902a7
+        succ:            [ { bid: 3, cnt: 89 }, { bid: 2, cnt: 10 } ]
+      - bid:             3
+        insns:           9
+        hash:            0xc516000073dc00a0
+        succ:            [ { bid: 5, cnt: 151 } ]
+      - bid:             5
+        insns:           9
+        hash:            0x6446e1ea500111
+  - name:            usqrt
+    fid:             7
+    hash:            0x0000000000000000
+    exec:            20
+    nblocks:         6
+    blocks:
+      - bid:             0
+        insns:           4
+        hash:            0x0000000000000001
+        exec:            20
+        succ:            [ { bid: 1, cnt: 0 } ]
+      - bid:             1
+        insns:           9
+        hash:            0x0000000000000001
+        succ:            [ { bid: 3, cnt: 320, mis: 171 }, { bid: 2, cnt: 0 } ]
+      - bid:             3
+        insns:           2
+        hash:            0x0000000000000001
+        succ:            [ { bid: 1, cnt: 300, mis: 33 }, { bid: 4, cnt: 20 } ]
+...
diff --git a/bolt/test/X86/stale-matching-min-matched-block.test b/bolt/test/X86/stale-matching-min-matched-block.test
new file mode 100644
index 0000000000000..383d4d7dcb9d2
--- /dev/null
+++ b/bolt/test/X86/stale-matching-min-matched-block.test
@@ -0,0 +1,10 @@
+## This script checks the stale-matching-min-matched-block flag.
+
+RUN: yaml2obj %p/Inputs/blarge.yaml &> %t.exe
+
+## Testing "usqrt"
+RUN: llvm-bolt %t.exe -o %t.null --b %p/Inputs/blarge_profile_stale_low_matched_blocks.yaml \
+RUN:   --infer-stale-profile=1 --stale-matching-min-matched-block=75 \
+RUN:   --profile-ignore-hash=1 --debug-only=bolt-prof 2>&1 | FileCheck %s
+
+CHECK:    BOLT-INFO: inferred profile for 1 (50.00% of profiled, 50.00% of stale) functions responsible for 46.31% samples (552 out of 1192)

From 0041582b6ca137ad04e26985a87a1ae45fb0f0bb Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere <jonas@devlieghere.com>
Date: Mon, 17 Jun 2024 14:52:40 -0700
Subject: [PATCH 22/25] [lldb] Fix Python interpreter workaround (attempt #2)

On macOS, to make DYLD_INSERT_LIBRARIES and the Python shim work
together, we have a workaround that copies the "real" Python interpreter
into the build directory. This doesn't work when running in a virtual
environment, as the copied interpreter cannot find the packages
installed in the virtual environment relative to itself.

Address this issue by copying the Python interpreter into the virtual
environment's `bin` folder, rather than the build folder, when the test
suite detects that it's being run inside a virtual environment.

I'm not thrilled about this solution because it puts a file outside the
build directory. However, given virtual environments are considered
disposable, this seems reasonable.
---
 lldb/test/API/lit.cfg.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py
index 48c5f49e78e01..6d45508ccb916 100644
--- a/lldb/test/API/lit.cfg.py
+++ b/lldb/test/API/lit.cfg.py
@@ -63,13 +63,14 @@ def find_python_interpreter():
     if "DYLD_INSERT_LIBRARIES" not in config.environment:
         return None
 
-    # If we're running in a virtual environment, we already have a copy of the
-    # Python executable.
+    # If we're running in a virtual environment, we have to copy Python into
+    # the virtual environment for it to work.
     if sys.prefix != sys.base_prefix:
-        return None
+        copied_python = os.path.join(sys.prefix, "bin", "copied-python")
+    else:
+        copied_python = os.path.join(config.lldb_build_directory, "copied-python")
 
     # Avoid doing any work if we already copied the binary.
-    copied_python = os.path.join(config.lldb_build_directory, "copied-python")
     if os.path.isfile(copied_python):
         return copied_python
 

From cd48335a8a5e97c4a4942fb4f94a511da3eb60f4 Mon Sep 17 00:00:00 2001
From: Ahmed Bougacha <ahmed@bougacha.org>
Date: Mon, 17 Jun 2024 15:19:24 -0700
Subject: [PATCH 23/25] [docs][Security Group] Update Apple representation.
 (#95491)

Remove Kate; Apple is now represented by Oliver (and myself).
---
 llvm/docs/Security.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/docs/Security.rst b/llvm/docs/Security.rst
index 91b762719138d..9bd2b1d435fd0 100644
--- a/llvm/docs/Security.rst
+++ b/llvm/docs/Security.rst
@@ -44,7 +44,6 @@ username for an individual isn't available, the brackets will be empty.
 * Ed Maste (individual; FreeBSD) [@emaste]
 * George Burgess IV (Google) [@gburgessiv]
 * Josh Stone (Red Hat; Rust) [@cuviper]
-* Kate McInnes (Apple) []
 * Kristof Beyls (ARM) [@kbeyls]
 * Matthew Riley (Google) [@mmdriley]
 * Nikhil Gupta (Nvidia) []

From c6049e67efaaca34ca8ad93b007397b118574b81 Mon Sep 17 00:00:00 2001
From: Florian Mayer <fmayer@google.com>
Date: Mon, 17 Jun 2024 15:04:57 -0700
Subject: [PATCH 24/25] Reapply "[HWASan] [compiler-rt] support non-4k pages on
 Android" (#95853)

Updated MapDynamicShadow callsite in asan_win.
---
 compiler-rt/lib/asan/asan_linux.cpp             |  3 ++-
 compiler-rt/lib/asan/asan_mac.cpp               |  3 ++-
 compiler-rt/lib/asan/asan_premap_shadow.cpp     |  3 ++-
 compiler-rt/lib/asan/asan_win.cpp               |  3 ++-
 .../lib/hwasan/hwasan_dynamic_shadow.cpp        | 17 +++++++++++------
 compiler-rt/lib/memprof/memprof_linux.cpp       |  3 ++-
 .../lib/sanitizer_common/sanitizer_common.h     | 17 ++++++++---------
 .../lib/sanitizer_common/sanitizer_linux.cpp    |  4 ++--
 .../sanitizer_linux_libcdep.cpp                 |  5 ++---
 .../lib/sanitizer_common/sanitizer_mac.cpp      |  4 ++--
 .../lib/sanitizer_common/sanitizer_win.cpp      |  5 ++---
 11 files changed, 37 insertions(+), 30 deletions(-)

diff --git a/compiler-rt/lib/asan/asan_linux.cpp b/compiler-rt/lib/asan/asan_linux.cpp
index a517de5af00dc..0b470db86748f 100644
--- a/compiler-rt/lib/asan/asan_linux.cpp
+++ b/compiler-rt/lib/asan/asan_linux.cpp
@@ -93,7 +93,8 @@ uptr FindDynamicShadowStart() {
 #  endif
 
   return MapDynamicShadow(shadow_size_bytes, ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 void AsanApplyToGlobals(globals_op_fptr op, const void *needle) {
diff --git a/compiler-rt/lib/asan/asan_mac.cpp b/compiler-rt/lib/asan/asan_mac.cpp
index b250f796e165f..bfc349223258b 100644
--- a/compiler-rt/lib/asan/asan_mac.cpp
+++ b/compiler-rt/lib/asan/asan_mac.cpp
@@ -51,7 +51,8 @@ bool IsSystemHeapAddress (uptr addr) { return false; }
 
 uptr FindDynamicShadowStart() {
   return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 // No-op. Mac does not support static linkage anyway.
diff --git a/compiler-rt/lib/asan/asan_premap_shadow.cpp b/compiler-rt/lib/asan/asan_premap_shadow.cpp
index bed2f62a22511..6e08b8f966507 100644
--- a/compiler-rt/lib/asan/asan_premap_shadow.cpp
+++ b/compiler-rt/lib/asan/asan_premap_shadow.cpp
@@ -33,7 +33,8 @@ uptr PremapShadowSize() {
 // PremapShadowSize() bytes on the right of it are mapped r/o.
 uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), /*mmap_alignment_scale*/ 3,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 bool PremapShadowFailed() {
diff --git a/compiler-rt/lib/asan/asan_win.cpp b/compiler-rt/lib/asan/asan_win.cpp
index cda1f7a91e140..09a13b11cff1f 100644
--- a/compiler-rt/lib/asan/asan_win.cpp
+++ b/compiler-rt/lib/asan/asan_win.cpp
@@ -268,7 +268,8 @@ void PlatformTSDDtor(void *tsd) { AsanThread::TSDDtor(tsd); }
 // ---------------------- Various stuff ---------------- {{{
 uptr FindDynamicShadowStart() {
   return MapDynamicShadow(MemToShadowSize(kHighMemEnd), ASAN_SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 void AsanCheckDynamicRTPrereqs() {}
diff --git a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
index 7642ba6c0bf08..48bc3b631ac07 100644
--- a/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
+++ b/compiler-rt/lib/hwasan/hwasan_dynamic_shadow.cpp
@@ -36,15 +36,20 @@ decltype(__hwasan_shadow)* __hwasan_premap_shadow();
 
 namespace __hwasan {
 
+// We cannot call anything in libc here (see comment above), so we need to
+// assume the biggest allowed page size.
+// Android max page size is defined as 16k here:
+// https://android.googlesource.com/platform/bionic/+/main/libc/platform/bionic/page.h#41
+static constexpr uptr kMaxGranularity = 16384;
+
 // Conservative upper limit.
 static uptr PremapShadowSize() {
-  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale,
-                   GetMmapGranularity());
+  return RoundUpTo(GetMaxVirtualAddress() >> kShadowScale, kMaxGranularity);
 }
 
 static uptr PremapShadow() {
   return MapDynamicShadow(PremapShadowSize(), kShadowScale,
-                          kShadowBaseAlignment, kHighMemEnd);
+                          kShadowBaseAlignment, kHighMemEnd, kMaxGranularity);
 }
 
 static bool IsPremapShadowAvailable() {
@@ -56,7 +61,7 @@ static bool IsPremapShadowAvailable() {
 }
 
 static uptr FindPremappedShadowStart(uptr shadow_size_bytes) {
-  const uptr granularity = GetMmapGranularity();
+  const uptr granularity = kMaxGranularity;
   const uptr shadow_start = reinterpret_cast<uptr>(&__hwasan_shadow);
   const uptr premap_shadow_size = PremapShadowSize();
   const uptr shadow_size = RoundUpTo(shadow_size_bytes, granularity);
@@ -109,7 +114,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
   if (IsPremapShadowAvailable())
     return FindPremappedShadowStart(shadow_size_bytes);
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd);
+                          kHighMemEnd, kMaxGranularity);
 }
 
 }  // namespace __hwasan
@@ -135,7 +140,7 @@ uptr FindDynamicShadowStart(uptr shadow_size_bytes) {
                                     RingBufferSize());
 #  endif
   return MapDynamicShadow(shadow_size_bytes, kShadowScale, kShadowBaseAlignment,
-                          kHighMemEnd);
+                          kHighMemEnd, GetMmapGranularity());
 }
 
 }  // namespace __hwasan
diff --git a/compiler-rt/lib/memprof/memprof_linux.cpp b/compiler-rt/lib/memprof/memprof_linux.cpp
index 26a2b456b874e..fbe5d250f840b 100644
--- a/compiler-rt/lib/memprof/memprof_linux.cpp
+++ b/compiler-rt/lib/memprof/memprof_linux.cpp
@@ -58,7 +58,8 @@ void InitializePlatformExceptionHandlers() {}
 uptr FindDynamicShadowStart() {
   uptr shadow_size_bytes = MemToShadowSize(kHighMemEnd);
   return MapDynamicShadow(shadow_size_bytes, SHADOW_SCALE,
-                          /*min_shadow_base_alignment*/ 0, kHighMemEnd);
+                          /*min_shadow_base_alignment*/ 0, kHighMemEnd,
+                          GetMmapGranularity());
 }
 
 void *MemprofDlSymNext(const char *sym) { return dlsym(RTLD_NEXT, sym); }
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
index c451fc962c529..2d1059140c303 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h
@@ -60,14 +60,10 @@ inline int Verbosity() {
   return atomic_load(&current_verbosity, memory_order_relaxed);
 }
 
-#if SANITIZER_ANDROID
-inline uptr GetPageSize() {
-// Android post-M sysconf(_SC_PAGESIZE) crashes if called from .preinit_array.
-  return 4096;
-}
-inline uptr GetPageSizeCached() {
-  return 4096;
-}
+#if SANITIZER_ANDROID && !defined(__aarch64__)
+// 32-bit Android only has 4k pages.
+inline uptr GetPageSize() { return 4096; }
+inline uptr GetPageSizeCached() { return 4096; }
 #else
 uptr GetPageSize();
 extern uptr PageSizeCached;
@@ -77,6 +73,7 @@ inline uptr GetPageSizeCached() {
   return PageSizeCached;
 }
 #endif
+
 uptr GetMmapGranularity();
 uptr GetMaxVirtualAddress();
 uptr GetMaxUserVirtualAddress();
@@ -91,6 +88,7 @@ void GetThreadStackAndTls(bool main, uptr *stk_addr, uptr *stk_size,
 
 // Memory management
 void *MmapOrDie(uptr size, const char *mem_type, bool raw_report = false);
+
 inline void *MmapOrDieQuietly(uptr size, const char *mem_type) {
   return MmapOrDie(size, mem_type, /*raw_report*/ true);
 }
@@ -139,7 +137,8 @@ void UnmapFromTo(uptr from, uptr to);
 // shadow_size_bytes bytes on the right, which on linux is mapped no access.
 // The high_mem_end may be updated if the original shadow size doesn't fit.
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end);
+                      uptr min_shadow_base_alignment, uptr &high_mem_end,
+                      uptr granularity);
 
 // Let S = max(shadow_size, num_aliases * alias_size, ring_buffer_size).
 // Reserves 2*S bytes of address space to the right of the returned address and
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
index 5d2dd3a7a658f..d15caa76efb06 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux.cpp
@@ -1136,7 +1136,7 @@ uptr GetMaxUserVirtualAddress() {
   return addr;
 }
 
-#  if !SANITIZER_ANDROID
+#  if !SANITIZER_ANDROID || defined(__aarch64__)
 uptr GetPageSize() {
 #    if SANITIZER_LINUX && (defined(__x86_64__) || defined(__i386__)) && \
         defined(EXEC_PAGESIZE)
@@ -1155,7 +1155,7 @@ uptr GetPageSize() {
   return sysconf(_SC_PAGESIZE);  // EXEC_PAGESIZE may not be trustworthy.
 #    endif
 }
-#  endif  // !SANITIZER_ANDROID
+#  endif
 
 uptr ReadBinaryName(/*out*/ char *buf, uptr buf_len) {
 #  if SANITIZER_SOLARIS
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
index 6d05411222d9e..175362183fd78 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_linux_libcdep.cpp
@@ -995,9 +995,8 @@ void UnmapFromTo(uptr from, uptr to) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment,
-                      UNUSED uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
index 24e3d1112520e..cbdf3e95925bf 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_mac.cpp
@@ -1188,8 +1188,8 @@ uptr GetMaxVirtualAddress() {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment, uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =
diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
index 4e5ad8e4693b4..0b198890fc798 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp
@@ -384,9 +384,8 @@ bool DontDumpShadowMemory(uptr addr, uptr length) {
 }
 
 uptr MapDynamicShadow(uptr shadow_size_bytes, uptr shadow_scale,
-                      uptr min_shadow_base_alignment,
-                      UNUSED uptr &high_mem_end) {
-  const uptr granularity = GetMmapGranularity();
+                      uptr min_shadow_base_alignment, UNUSED uptr &high_mem_end,
+                      uptr granularity) {
   const uptr alignment =
       Max<uptr>(granularity << shadow_scale, 1ULL << min_shadow_base_alignment);
   const uptr left_padding =

From 013a40d474e3acaa7a090d5e279f2d8a2f18fbd8 Mon Sep 17 00:00:00 2001
From: Shilei Tian <i@tianshilei.me>
Date: Mon, 17 Jun 2024 18:48:33 -0400
Subject: [PATCH 25/25] [Clang][AMDGPU] Add a new builtin type for buffer rsrc

---
 clang/include/clang/AST/ASTContext.h          |  2 +
 clang/include/clang/AST/Type.h                |  3 +
 clang/include/clang/AST/TypeProperties.td     |  4 +
 clang/include/clang/Basic/AMDGPUTypes.def     | 21 +++++
 .../include/clang/Serialization/ASTBitCodes.h |  5 +-
 clang/lib/AST/ASTContext.cpp                  | 16 ++++
 clang/lib/AST/ASTImporter.cpp                 |  4 +
 clang/lib/AST/ExprConstant.cpp                |  2 +
 clang/lib/AST/ItaniumMangle.cpp               |  6 ++
 clang/lib/AST/MicrosoftMangle.cpp             |  2 +
 clang/lib/AST/NSAPI.cpp                       |  2 +
 clang/lib/AST/PrintfFormatString.cpp          |  2 +
 clang/lib/AST/Type.cpp                        |  6 ++
 clang/lib/AST/TypeLoc.cpp                     |  2 +
 clang/lib/CodeGen/CGDebugInfo.cpp             | 11 ++-
 clang/lib/CodeGen/CGDebugInfo.h               |  2 +
 clang/lib/CodeGen/CodeGenTypes.cpp            |  5 ++
 clang/lib/CodeGen/ItaniumCXXABI.cpp           |  2 +
 clang/lib/Index/USRGeneration.cpp             |  5 ++
 clang/lib/Sema/Sema.cpp                       |  8 ++
 clang/lib/Sema/SemaExpr.cpp                   |  4 +
 clang/lib/Serialization/ASTCommon.cpp         |  5 ++
 clang/lib/Serialization/ASTReader.cpp         |  5 ++
 clang/test/AST/ast-dump-amdgpu-types.c        | 10 +++
 .../amdgpu-buffer-rsrc-type-debug-info.c      |  8 ++
 .../amdgpu-buffer-rsrc-typeinfo.cpp           |  9 ++
 .../CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl  | 82 +++++++++++++++++++
 clang/test/SemaCXX/amdgpu-buffer-rsrc.cpp     | 17 ++++
 clang/test/SemaHIP/amdgpu-buffer-rsrc.hip     | 20 +++++
 clang/test/SemaOpenCL/amdgpu-buffer-rsrc.cl   | 12 +++
 clang/test/SemaOpenMP/amdgpu-buffer-rsrc.cpp  | 17 ++++
 clang/tools/libclang/CIndex.cpp               |  2 +
 32 files changed, 299 insertions(+), 2 deletions(-)
 create mode 100644 clang/include/clang/Basic/AMDGPUTypes.def
 create mode 100644 clang/test/AST/ast-dump-amdgpu-types.c
 create mode 100644 clang/test/CodeGen/amdgpu-buffer-rsrc-type-debug-info.c
 create mode 100644 clang/test/CodeGenCXX/amdgpu-buffer-rsrc-typeinfo.cpp
 create mode 100644 clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl
 create mode 100644 clang/test/SemaCXX/amdgpu-buffer-rsrc.cpp
 create mode 100644 clang/test/SemaHIP/amdgpu-buffer-rsrc.hip
 create mode 100644 clang/test/SemaOpenCL/amdgpu-buffer-rsrc.cl
 create mode 100644 clang/test/SemaOpenMP/amdgpu-buffer-rsrc.cpp

diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
index 53ece996769a8..4d1f440506e09 100644
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -1147,6 +1147,8 @@ class ASTContext : public RefCountedBase<ASTContext> {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) CanQualType SingletonId;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) CanQualType SingletonId;
+#include "clang/Basic/AMDGPUTypes.def"
 
   // Types for deductions in C++0x [stmt.ranged]'s desugaring. Built on demand.
   mutable QualType AutoDeductTy;     // Deduction against 'auto'.
diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index fab233b62d8d1..61246479188e9 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -3015,6 +3015,9 @@ class BuiltinType : public Type {
 // WebAssembly reference types
 #define WASM_TYPE(Name, Id, SingletonId) Id,
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+// AMDGPU types
+#define AMDGPU_TYPE(Name, Id, SingletonId) Id,
+#include "clang/Basic/AMDGPUTypes.def"
 // All other builtin types
 #define BUILTIN_TYPE(Id, SingletonId) Id,
 #define LAST_BUILTIN_TYPE(Id) LastKind = Id
diff --git a/clang/include/clang/AST/TypeProperties.td b/clang/include/clang/AST/TypeProperties.td
index 40dd16f080e2e..aba14b222a03a 100644
--- a/clang/include/clang/AST/TypeProperties.td
+++ b/clang/include/clang/AST/TypeProperties.td
@@ -861,6 +861,10 @@ let Class = BuiltinType in {
       case BuiltinType::ID: return ctx.SINGLETON_ID;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
 
+#define AMDGPU_TYPE(NAME, ID, SINGLETON_ID) \
+      case BuiltinType::ID: return ctx.SINGLETON_ID;
+#include "clang/Basic/AMDGPUTypes.def"
+
 #define BUILTIN_TYPE(ID, SINGLETON_ID) \
       case BuiltinType::ID: return ctx.SINGLETON_ID;
 #include "clang/AST/BuiltinTypes.def"
diff --git a/clang/include/clang/Basic/AMDGPUTypes.def b/clang/include/clang/Basic/AMDGPUTypes.def
new file mode 100644
index 0000000000000..e0d7be470a325
--- /dev/null
+++ b/clang/include/clang/Basic/AMDGPUTypes.def
@@ -0,0 +1,21 @@
+//===-- AMDGPUTypes.def - Metadata about AMDGPU types -----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+//  This file defines various AMDGPU builtin types.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef AMDGPU_OPAQUE_PTR_TYPE
+#define AMDGPU_OPAQUE_PTR_TYPE(Name, MangledName, AS, Width, Align, Id, SingletonId) \
+  AMDGPU_TYPE(Name, Id, SingletonId)
+#endif
+
+AMDGPU_OPAQUE_PTR_TYPE("__amdgpu_buffer_rsrc_t", "__amdgpu_buffer_rsrc_t", 8, 128, 128, AMDGPUBufferRsrc, AMDGPUBufferRsrcTy)
+
+#undef AMDGPU_TYPE
+#undef AMDGPU_OPAQUE_PTR_TYPE
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
index a4728b1c06b3f..24e616f76b9af 100644
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1097,6 +1097,9 @@ enum PredefinedTypeIDs {
 // \brief WebAssembly reference types with auto numeration
 #define WASM_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID,
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+// \brief AMDGPU types with auto numeration
+#define AMDGPU_TYPE(Name, Id, SingletonId) PREDEF_TYPE_##Id##_ID,
+#include "clang/Basic/AMDGPUTypes.def"
 
   /// The placeholder type for unresolved templates.
   PREDEF_TYPE_UNRESOLVED_TEMPLATE,
@@ -1109,7 +1112,7 @@ enum PredefinedTypeIDs {
 ///
 /// Type IDs for non-predefined types will start at
 /// NUM_PREDEF_TYPE_IDs.
-const unsigned NUM_PREDEF_TYPE_IDS = 503;
+const unsigned NUM_PREDEF_TYPE_IDS = 504;
 
 // Ensure we do not overrun the predefined types we reserved
 // in the enum PredefinedTypeIDs above.
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
index 34aa399fda2f8..d389ef12468ee 100644
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1384,6 +1384,13 @@ void ASTContext::InitBuiltinTypes(const TargetInfo &Target,
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
   }
 
+  if (Target.getTriple().isAMDGPU() ||
+      (AuxTarget && AuxTarget->getTriple().isAMDGPU())) {
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  InitBuiltinType(SingletonId, BuiltinType::Id);
+#include "clang/Basic/AMDGPUTypes.def"
+  }
+
   // Builtin type for __objc_yes and __objc_no
   ObjCBuiltinBoolTy = (Target.useSignedCharForObjCBool() ?
                        SignedCharTy : BoolTy);
@@ -2200,6 +2207,13 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const {
     Align = 8;                                                                 \
     break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_OPAQUE_PTR_TYPE(NAME, MANGLEDNAME, AS, WIDTH, ALIGN, ID,        \
+                               SINGLETONID)                                    \
+  case BuiltinType::ID:                                                        \
+    Width = WIDTH;                                                             \
+    Align = ALIGN;                                                             \
+    break;
+#include "clang/Basic/AMDGPUTypes.def"
     }
     break;
   case Type::ObjCObjectPointer:
@@ -8168,6 +8182,8 @@ static char getObjCEncodingForPrimitiveType(const ASTContext *C,
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
       {
         DiagnosticsEngine &Diags = C->getDiagnostics();
         unsigned DiagID = Diags.getCustomDiagID(DiagnosticsEngine::Error,
diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp
index 02cd4ed9a6cac..1b67feaae8874 100644
--- a/clang/lib/AST/ASTImporter.cpp
+++ b/clang/lib/AST/ASTImporter.cpp
@@ -1099,6 +1099,10 @@ ExpectedType ASTNodeImporter::VisitBuiltinType(const BuiltinType *T) {
   case BuiltinType::Id:                                                        \
     return Importer.getToContext().SingletonId;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case BuiltinType::Id:                                                        \
+    return Importer.getToContext().SingletonId;
+#include "clang/Basic/AMDGPUTypes.def"
 #define SHARED_SINGLETON_TYPE(Expansion)
 #define BUILTIN_TYPE(Id, SingletonId) \
   case BuiltinType::Id: return Importer.getToContext().SingletonId;
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 3a6c8b4f82cca..dd355a7125c5a 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -11814,6 +11814,8 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T,
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
       return GCCTypeClass::None;
 
     case BuiltinType::Dependent:
diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp
index ed9e6eeb36c75..203db72c43733 100644
--- a/clang/lib/AST/ItaniumMangle.cpp
+++ b/clang/lib/AST/ItaniumMangle.cpp
@@ -3423,6 +3423,12 @@ void CXXNameMangler::mangleType(const BuiltinType *T) {
     Out << 'u' << type_name.size() << type_name;                               \
     break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case BuiltinType::Id:                                                        \
+    type_name = Name;                                                          \
+    Out << 'u' << type_name.size() << type_name;                               \
+    break;
+#include "clang/Basic/AMDGPUTypes.def"
   }
 }
 
diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp
index a863ec7a529b9..d87be5f2043a9 100644
--- a/clang/lib/AST/MicrosoftMangle.cpp
+++ b/clang/lib/AST/MicrosoftMangle.cpp
@@ -2612,6 +2612,8 @@ void MicrosoftCXXNameMangler::mangleType(const BuiltinType *T, Qualifiers,
 #include "clang/Basic/PPCTypes.def"
 #define RVV_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/RISCVVTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::ShortAccum:
   case BuiltinType::Accum:
   case BuiltinType::LongAccum:
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
index 2d16237f5325a..48d1763125e6c 100644
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -453,6 +453,8 @@ NSAPI::getNSNumberFactoryMethodKind(QualType T) const {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::BoundMember:
   case BuiltinType::UnresolvedTemplate:
   case BuiltinType::Dependent:
diff --git a/clang/lib/AST/PrintfFormatString.cpp b/clang/lib/AST/PrintfFormatString.cpp
index dd3b38fabb550..3031d76abbd75 100644
--- a/clang/lib/AST/PrintfFormatString.cpp
+++ b/clang/lib/AST/PrintfFormatString.cpp
@@ -865,6 +865,8 @@ bool PrintfSpecifier::fixType(QualType QT, const LangOptions &LangOpt,
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
 #define SIGNED_TYPE(Id, SingletonId)
 #define UNSIGNED_TYPE(Id, SingletonId)
 #define FLOATING_TYPE(Id, SingletonId)
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 33acae2cbafac..656b733a13b0e 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3509,6 +3509,10 @@ StringRef BuiltinType::getName(const PrintingPolicy &Policy) const {
   case Id:                                                                     \
     return Name;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case Id:                                                                     \
+    return Name;
+#include "clang/Basic/AMDGPUTypes.def"
   }
 
   llvm_unreachable("Invalid builtin type.");
@@ -4778,6 +4782,8 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
     case BuiltinType::BuiltinFn:
     case BuiltinType::NullPtr:
     case BuiltinType::IncompleteMatrixIdx:
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
index 9dd90d9bf4e54..33e6ccbadc12d 100644
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -428,6 +428,8 @@ TypeSpecifierType BuiltinTypeLoc::getWrittenTypeSpec() const {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::BuiltinFn:
   case BuiltinType::IncompleteMatrixIdx:
   case BuiltinType::ArraySection:
diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp
index 572ba84d22ef5..a072475ba7705 100644
--- a/clang/lib/CodeGen/CGDebugInfo.cpp
+++ b/clang/lib/CodeGen/CGDebugInfo.cpp
@@ -865,7 +865,16 @@ llvm::DIType *CGDebugInfo::CreateType(const BuiltinType *BT) {
     return SingletonId;                                                        \
   }
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
-
+#define AMDGPU_OPAQUE_PTR_TYPE(Name, MangledName, AS, Width, Align, Id,        \
+                               SingletonId)                                    \
+  case BuiltinType::Id: {                                                      \
+    if (!SingletonId)                                                          \
+      SingletonId =                                                            \
+          DBuilder.createForwardDecl(llvm::dwarf::DW_TAG_structure_type,       \
+                                     MangledName, TheCU, TheCU->getFile(), 0); \
+    return SingletonId;                                                        \
+  }
+#include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::UChar:
   case BuiltinType::Char_U:
     Encoding = llvm::dwarf::DW_ATE_unsigned_char;
diff --git a/clang/lib/CodeGen/CGDebugInfo.h b/clang/lib/CodeGen/CGDebugInfo.h
index 272c8d6e75965..2731c627d9dc3 100644
--- a/clang/lib/CodeGen/CGDebugInfo.h
+++ b/clang/lib/CodeGen/CGDebugInfo.h
@@ -83,6 +83,8 @@ class CGDebugInfo {
 #include "clang/Basic/OpenCLExtensionTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) llvm::DIType *SingletonId = nullptr;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) llvm::DIType *SingletonId = nullptr;
+#include "clang/Basic/AMDGPUTypes.def"
 
   /// Cache of previously constructed Types.
   llvm::DenseMap<const void *, llvm::TrackingMDRef> TypeCache;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
index 0a926e4ac27fe..d823c336e39bf 100644
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -533,6 +533,11 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) {
       llvm_unreachable("Unexpected wasm reference builtin type!");             \
   } break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_OPAQUE_PTR_TYPE(Name, MangledName, AS, Width, Align, Id,        \
+                               SingletonId)                                    \
+  case BuiltinType::Id:                                                        \
+    return llvm::PointerType::get(getLLVMContext(), AS);
+#include "clang/Basic/AMDGPUTypes.def"
     case BuiltinType::Dependent:
 #define BUILTIN_TYPE(Id, SingletonId)
 #define PLACEHOLDER_TYPE(Id, SingletonId) \
diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
index 5a3e83de625c9..01a735c1437e1 100644
--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
+++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
@@ -3362,6 +3362,8 @@ static bool TypeInfoIsInStandardLibrary(const BuiltinType *Ty) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
     case BuiltinType::ShortAccum:
     case BuiltinType::Accum:
     case BuiltinType::LongAccum:
diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp
index 31c4a3345c09d..5036ddee35fd1 100644
--- a/clang/lib/Index/USRGeneration.cpp
+++ b/clang/lib/Index/USRGeneration.cpp
@@ -772,6 +772,11 @@ void USRGenerator::VisitType(QualType T) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case BuiltinType::Id:                                                        \
+    Out << "@BT@" << #Name;                                                    \
+    break;
+#include "clang/Basic/AMDGPUTypes.def"
         case BuiltinType::ShortAccum:
           Out << "@BT@ShortAccum"; break;
         case BuiltinType::Accum:
diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 907a05a5d1b49..069978c1b4023 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -497,6 +497,14 @@ void Sema::Initialize() {
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
   }
 
+  if (Context.getTargetInfo().getTriple().isAMDGPU() ||
+      (Context.getAuxTargetInfo() &&
+       Context.getAuxTargetInfo()->getTriple().isAMDGPU())) {
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  addImplicitTypedef(Name, Context.SingletonId);
+#include "clang/Basic/AMDGPUTypes.def"
+  }
+
   if (Context.getTargetInfo().hasBuiltinMSVaList()) {
     DeclarationName MSVaList = &Context.Idents.get("__builtin_ms_va_list");
     if (IdResolver.begin(MSVaList) == IdResolver.end())
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
index 4db8b4130c3c7..a31cefc540cbf 100644
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -6169,6 +6169,8 @@ static bool isPlaceholderToRemoveAsArg(QualType type) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
 #define PLACEHOLDER_TYPE(ID, SINGLETON_ID)
 #define BUILTIN_TYPE(ID, SINGLETON_ID) case BuiltinType::ID:
 #include "clang/AST/BuiltinTypes.def"
@@ -21004,6 +21006,8 @@ ExprResult Sema::CheckPlaceholderExpr(Expr *E) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
 #define BUILTIN_TYPE(Id, SingletonId) case BuiltinType::Id:
 #define PLACEHOLDER_TYPE(Id, SingletonId)
 #include "clang/AST/BuiltinTypes.def"
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
index bc662a87a7bf3..3385cb8aad7e4 100644
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -258,6 +258,11 @@ serialization::TypeIdxFromBuiltin(const BuiltinType *BT) {
     ID = PREDEF_TYPE_##Id##_ID;                                                \
     break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case BuiltinType::Id:                                                        \
+    ID = PREDEF_TYPE_##Id##_ID;                                                \
+    break;
+#include "clang/Basic/AMDGPUTypes.def"
   case BuiltinType::BuiltinFn:
     ID = PREDEF_TYPE_BUILTIN_FN;
     break;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index a2c322087fd1e..0810d720bb4e0 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -7401,6 +7401,11 @@ QualType ASTReader::GetType(TypeID ID) {
     T = Context.SingletonId;                                                   \
     break;
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId)                                     \
+  case PREDEF_TYPE_##Id##_ID:                                                  \
+    T = Context.SingletonId;                                                   \
+    break;
+#include "clang/Basic/AMDGPUTypes.def"
     }
 
     assert(!T.isNull() && "Unknown predefined type");
diff --git a/clang/test/AST/ast-dump-amdgpu-types.c b/clang/test/AST/ast-dump-amdgpu-types.c
new file mode 100644
index 0000000000000..e032d678f1a09
--- /dev/null
+++ b/clang/test/AST/ast-dump-amdgpu-types.c
@@ -0,0 +1,10 @@
+// REQUIRES: amdgpu-registered-target
+// Test without serialization:
+// RUN: %clang_cc1 -triple amdgcn -ast-dump -ast-dump-filter __amdgpu_buffer_rsrc_t %s | FileCheck %s
+//
+// Test with serialization:
+// RUN: %clang_cc1 -triple amdgcn -emit-pch -o %t %s
+// RUN: %clang_cc1 -x c -triple amdgcn -include-pch %t -ast-dump-all -ast-dump-filter __amdgpu_buffer_rsrc_t /dev/null | sed -e "s/ <undeserialized declarations>//" -e "s/ imported//" | FileCheck %s
+
+// CHECK: TypedefDecl {{.*}} implicit __amdgpu_buffer_rsrc_t
+// CHECK-NEXT: -BuiltinType {{.*}} '__amdgpu_buffer_rsrc_t'
diff --git a/clang/test/CodeGen/amdgpu-buffer-rsrc-type-debug-info.c b/clang/test/CodeGen/amdgpu-buffer-rsrc-type-debug-info.c
new file mode 100644
index 0000000000000..c266fa83e4b62
--- /dev/null
+++ b/clang/test/CodeGen/amdgpu-buffer-rsrc-type-debug-info.c
@@ -0,0 +1,8 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn -emit-llvm -o - %s -debug-info-kind=limited 2>&1 | FileCheck %s
+
+// CHECK: name: "__amdgpu_buffer_rsrc_t",{{.*}}baseType: ![[BT:[0-9]+]]
+// CHECK: [[BT]] = !DICompositeType(tag: DW_TAG_structure_type, name: "__amdgpu_buffer_rsrc_t", {{.*}} flags: DIFlagFwdDecl)
+void test_locals(void) {
+  __amdgpu_buffer_rsrc_t k;
+}
diff --git a/clang/test/CodeGenCXX/amdgpu-buffer-rsrc-typeinfo.cpp b/clang/test/CodeGenCXX/amdgpu-buffer-rsrc-typeinfo.cpp
new file mode 100644
index 0000000000000..a44e7dc5efe6a
--- /dev/null
+++ b/clang/test/CodeGenCXX/amdgpu-buffer-rsrc-typeinfo.cpp
@@ -0,0 +1,9 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn %s -emit-llvm -o - | FileCheck %s
+
+namespace std { class type_info; };
+
+auto &b = typeid(__amdgpu_buffer_rsrc_t);
+
+// CHECK-DAG: @_ZTSu22__amdgpu_buffer_rsrc_t = {{.*}} c"u22__amdgpu_buffer_rsrc_t\00"
+// CHECK-DAG: @_ZTIu22__amdgpu_buffer_rsrc_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu22__amdgpu_buffer_rsrc_t
diff --git a/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl
new file mode 100644
index 0000000000000..69dabda08fba6
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/amdgcn-buffer-rsrc-type.cl
@@ -0,0 +1,82 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature
+ // REQUIRES: amdgpu-registered-target
+ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu verde -emit-llvm -o - %s | FileCheck %s
+
+typedef struct AA_ty {
+  int x;
+  __amdgpu_buffer_rsrc_t r;
+} AA;
+
+AA getAA(void *p);
+__amdgpu_buffer_rsrc_t getBufferImpl(void *p);
+void consumeBuffer(__amdgpu_buffer_rsrc_t);
+
+// CHECK-LABEL: define {{[^@]+}}@getBuffer
+// CHECK-SAME: (ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call ptr addrspace(8) @getBufferImpl(ptr addrspace(5) noundef [[P]]) #[[ATTR2:[0-9]+]]
+// CHECK-NEXT:    ret ptr addrspace(8) [[CALL]]
+//
+__amdgpu_buffer_rsrc_t getBuffer(void *p) {
+  return getBufferImpl(p);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@consumeBufferPtr
+// CHECK-SAME: (ptr addrspace(5) noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq ptr addrspace(5) [[P]], addrspacecast (ptr null to ptr addrspace(5))
+// CHECK-NEXT:    br i1 [[TOBOOL_NOT]], label [[IF_END:%.*]], label [[IF_THEN:%.*]]
+// CHECK:       if.then:
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[P]], align 16, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-NEXT:    tail call void @consumeBuffer(ptr addrspace(8) [[TMP0]]) #[[ATTR2]]
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    ret void
+//
+void consumeBufferPtr(__amdgpu_buffer_rsrc_t *p) {
+  if (p)
+    consumeBuffer(*p);
+}
+
+// CHECK-LABEL: define {{[^@]+}}@test
+// CHECK-SAME: (ptr addrspace(5) noundef readonly [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = load i32, ptr addrspace(5) [[A]], align 16, !tbaa [[TBAA8:![0-9]+]]
+// CHECK-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    [[TOBOOL_NOT_I:%.*]] = icmp eq ptr addrspace(5) [[A]], addrspacecast (ptr null to ptr addrspace(5))
+// CHECK-NEXT:    [[OR_COND:%.*]] = or i1 [[TOBOOL_NOT_I]], [[TOBOOL_NOT]]
+// CHECK-NEXT:    br i1 [[OR_COND]], label [[IF_END:%.*]], label [[IF_THEN_I:%.*]]
+// CHECK:       if.then.i:
+// CHECK-NEXT:    [[R:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[A]], i32 16
+// CHECK-NEXT:    [[TMP1:%.*]] = load ptr addrspace(8), ptr addrspace(5) [[R]], align 16, !tbaa [[TBAA4]]
+// CHECK-NEXT:    tail call void @consumeBuffer(ptr addrspace(8) [[TMP1]]) #[[ATTR2]]
+// CHECK-NEXT:    br label [[IF_END]]
+// CHECK:       if.end:
+// CHECK-NEXT:    ret void
+//
+void test(AA *a) {
+  if (a->x)
+    consumeBufferPtr(&(a->r));
+}
+
+// CHECK-LABEL: define {{[^@]+}}@bar
+// CHECK-SAME: (ptr addrspace(5) noundef [[P:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[CALL:%.*]] = tail call [[STRUCT_AA_TY:%.*]] @[[GETAA:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]](ptr addrspace(5) noundef [[P]]) #[[ATTR2]]
+// CHECK-NEXT:    [[TMP0:%.*]] = extractvalue [[STRUCT_AA_TY]] [[CALL]], 0
+// CHECK-NEXT:    [[CALL_I:%.*]] = tail call ptr addrspace(8) @getBufferImpl(ptr addrspace(5) noundef [[P]]) #[[ATTR2]]
+// CHECK-NEXT:    [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[TMP0]], 0
+// CHECK-NEXT:    br i1 [[TOBOOL_NOT_I]], label [[TEST_EXIT:%.*]], label [[IF_THEN_I_I:%.*]]
+// CHECK:       if.then.i.i:
+// CHECK-NEXT:    tail call void @consumeBuffer(ptr addrspace(8) [[CALL_I]]) #[[ATTR2]]
+// CHECK-NEXT:    br label [[TEST_EXIT]]
+// CHECK:       test.exit:
+// CHECK-NEXT:    [[DOTFCA_1_INSERT:%.*]] = insertvalue [[STRUCT_AA_TY]] [[CALL]], ptr addrspace(8) [[CALL_I]], 1
+// CHECK-NEXT:    ret [[STRUCT_AA_TY]] [[DOTFCA_1_INSERT]]
+//
+AA bar(void *p) {
+  AA a = getAA(p);
+  a.r = getBuffer(p);
+  test(&a);
+  return a;
+}
diff --git a/clang/test/SemaCXX/amdgpu-buffer-rsrc.cpp b/clang/test/SemaCXX/amdgpu-buffer-rsrc.cpp
new file mode 100644
index 0000000000000..80c4c519c4e6b
--- /dev/null
+++ b/clang/test/SemaCXX/amdgpu-buffer-rsrc.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -std=gnu++11 -triple amdgcn -Wno-unused-value %s
+
+void foo() {
+  int n = 100;
+  __amdgpu_buffer_rsrc_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_buffer_rsrc_t' with an rvalue of type 'int'}}
+  static_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+  dynamic_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{invalid target type '__amdgpu_buffer_rsrc_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_buffer_rsrc_t'}}
+  __amdgpu_buffer_rsrc_t k;
+  int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'int *'}}
+  void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_buffer_rsrc_t) == 16, "wrong size");
+static_assert(alignof(__amdgpu_buffer_rsrc_t) == 16, "wrong alignment");
diff --git a/clang/test/SemaHIP/amdgpu-buffer-rsrc.hip b/clang/test/SemaHIP/amdgpu-buffer-rsrc.hip
new file mode 100644
index 0000000000000..3e5b22dc8963d
--- /dev/null
+++ b/clang/test/SemaHIP/amdgpu-buffer-rsrc.hip
@@ -0,0 +1,20 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -fsyntax-only -verify -triple amdgcn -Wno-unused-value %s
+// RUN: %clang_cc1 -fsyntax-only -verify -triple x86_64 -aux-triple amdgcn -Wno-unused-value %s
+
+#define __device__ __attribute__((device))
+
+__device__ void foo() {
+  int n = 100;
+  __amdgpu_buffer_rsrc_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_buffer_rsrc_t' with an rvalue of type 'int'}}
+  static_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+  dynamic_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{invalid target type '__amdgpu_buffer_rsrc_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+  reinterpret_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+  int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_buffer_rsrc_t'}}
+  __amdgpu_buffer_rsrc_t k;
+  int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'int *'}}
+  void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'void *'}}
+}
+
+static_assert(sizeof(__amdgpu_buffer_rsrc_t) == 16, "wrong size");
+static_assert(alignof(__amdgpu_buffer_rsrc_t) == 16, "wrong alignment");
diff --git a/clang/test/SemaOpenCL/amdgpu-buffer-rsrc.cl b/clang/test/SemaOpenCL/amdgpu-buffer-rsrc.cl
new file mode 100644
index 0000000000000..2d74835699c6d
--- /dev/null
+++ b/clang/test/SemaOpenCL/amdgpu-buffer-rsrc.cl
@@ -0,0 +1,12 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -Wno-unused-value %s
+// RUN: %clang_cc1 -verify -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -Wno-unused-value %s
+
+void foo() {
+    int n = 100;
+    __amdgpu_buffer_rsrc_t v = 0; // expected-error {{initializing '__private __amdgpu_buffer_rsrc_t' with an expression of incompatible type 'int'}}
+    int c = v; // expected-error {{initializing '__private int' with an expression of incompatible type '__private __amdgpu_buffer_rsrc_t'}}
+    __amdgpu_buffer_rsrc_t k;
+    int *ip = (int *)k; // expected-error {{operand of type '__amdgpu_buffer_rsrc_t' where arithmetic or pointer type is required}}
+    void *vp = (void *)k; // expected-error {{operand of type '__amdgpu_buffer_rsrc_t' where arithmetic or pointer type is required}}
+ }
diff --git a/clang/test/SemaOpenMP/amdgpu-buffer-rsrc.cpp b/clang/test/SemaOpenMP/amdgpu-buffer-rsrc.cpp
new file mode 100644
index 0000000000000..eb6ded229a75c
--- /dev/null
+++ b/clang/test/SemaOpenMP/amdgpu-buffer-rsrc.cpp
@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=amdgcn-amd-amdhsa -triple amdgcn-amd-amdhsa -fopenmp-is-target-device -Wno-unused-value %s
+
+void foo() {
+#pragma omp target
+  {
+    int n = 100;
+    __amdgpu_buffer_rsrc_t v = 0; // expected-error {{cannot initialize a variable of type '__amdgpu_buffer_rsrc_t' with an rvalue of type 'int'}}
+    static_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{static_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+    dynamic_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{invalid target type '__amdgpu_buffer_rsrc_t' for dynamic_cast; target type must be a reference or pointer type to a defined class}}
+    reinterpret_cast<__amdgpu_buffer_rsrc_t>(n); // expected-error {{reinterpret_cast from 'int' to '__amdgpu_buffer_rsrc_t' is not allowed}}
+    int c(v); // expected-error {{cannot initialize a variable of type 'int' with an lvalue of type '__amdgpu_buffer_rsrc_t'}}
+    __amdgpu_buffer_rsrc_t k;
+    int *ip = (int *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'int *'}}
+    void *vp = (void *)k; // expected-error {{cannot cast from type '__amdgpu_buffer_rsrc_t' to pointer type 'void *'}}
+  }
+ }
diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp
index dcd9555e1bfcc..35312e3d2ae70 100644
--- a/clang/tools/libclang/CIndex.cpp
+++ b/clang/tools/libclang/CIndex.cpp
@@ -1643,6 +1643,8 @@ bool CursorVisitor::VisitBuiltinTypeLoc(BuiltinTypeLoc TL) {
 #include "clang/Basic/RISCVVTypes.def"
 #define WASM_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
 #include "clang/Basic/WebAssemblyReferenceTypes.def"
+#define AMDGPU_TYPE(Name, Id, SingletonId) case BuiltinType::Id:
+#include "clang/Basic/AMDGPUTypes.def"
 #define BUILTIN_TYPE(Id, SingletonId)
 #define SIGNED_TYPE(Id, SingletonId) case BuiltinType::Id:
 #define UNSIGNED_TYPE(Id, SingletonId) case BuiltinType::Id: