diff --git a/tensorflow/core/util/gpu_kernel_helper.h b/tensorflow/core/util/gpu_kernel_helper.h
index 63c4f89b8c5b71..eb7a698b0a9f16 100644
--- a/tensorflow/core/util/gpu_kernel_helper.h
+++ b/tensorflow/core/util/gpu_kernel_helper.h
@@ -134,9 +134,17 @@ Status GpuLaunchKernel(void (*function)(Ts...), dim3 grid_dim, dim3 block_dim,
     return errors::Internal(cudaGetErrorString(result));
   }
 #elif TENSORFLOW_USE_ROCM
-  hipLaunchKernelGGL(function, grid_dim, block_dim, shared_memory_size_bytes,
-                     stream, std::forward<Args>(arguments)...);
-  TF_RETURN_IF_CUDA_ERROR(hipGetLastError());
+  constexpr size_t count = sizeof...(Args);
+  auto tup_ = std::tuple<Args...>{arguments...};
+  auto tup = validateArgsCountType(function, tup_);
+  void* _Args[count];
+  pArgs<0>(tup, _Args);
+  auto k = reinterpret_cast<void*>(function);
+  auto result =
+      hipLaunchKernel(k, grid_dim, block_dim, _Args, shared_memory_size_bytes, stream);
+  if (result != hipSuccess) {
+    return errors::Internal(hipGetErrorString(result));
+  }
 #endif
   }
   return OkStatus();
diff --git a/tensorflow/workspace2.bzl b/tensorflow/workspace2.bzl
index 9e49ab40b25dd8..e4c57e22ba0f06 100644
--- a/tensorflow/workspace2.bzl
+++ b/tensorflow/workspace2.bzl
@@ -61,6 +61,7 @@ load("//third_party/ruy:workspace.bzl", ruy = "repo")
 load("//third_party/sobol_data:workspace.bzl", sobol_data = "repo")
 load("//third_party/systemlibs:syslibs_configure.bzl", "syslibs_configure")
 load("//third_party/vulkan_headers:workspace.bzl", vulkan_headers = "repo")
+load("@local_xla//third_party/rocm_device_libs:workspace.bzl", rocm_device_libs = "repo")
 
 def _initialize_third_party():
     """ Load third party repositories. See above load() statements. """
@@ -87,6 +88,7 @@ def _initialize_third_party():
     ml_dtypes()
     nanobind()
     nasm()
+    rocm_device_libs()
     opencl_headers()
     pasta()
     pybind11_abseil()
diff --git a/third_party/llvm/0001-clang-CodeGen-sret-args-should-always-point-to-the-a.patch b/third_party/llvm/0001-clang-CodeGen-sret-args-should-always-point-to-the-a.patch
new file mode 100644
index 00000000000000..79a5c7c66858cf
--- /dev/null
+++ b/third_party/llvm/0001-clang-CodeGen-sret-args-should-always-point-to-the-a.patch
@@ -0,0 +1,1664 @@
+From ae0464944300fae4012981b2f04fb93c21fdf228 Mon Sep 17 00:00:00 2001
+From: Alex Voicu
+Date: Fri, 14 Feb 2025 11:20:45 +0000
+Subject: [PATCH] [clang][CodeGen] `sret` args should always point to the
+ `alloca` AS, so use that (#114062)
+
+`sret` arguments are always going to reside in the stack/`alloca`
+address space, which makes the current formulation where their AS is
+derived from the pointee somewhat quaint. This patch ensures that `sret`
+ends up pointing to the `alloca` AS in IR function signatures, and also
+guards against trying to pass a casted `alloca`d pointer to a `sret` arg,
+which can happen for most languages, when compiled for targets that have
+a non-zero `alloca` AS (e.g. AMDGCN) / map `LangAS::default` to a
+non-zero value (SPIR-V). A target could still choose to do something
+different here, by e.g. overriding `classifyReturnType` behaviour.
+
+In a broader sense, this patch extends non-aliased indirect args to also
+carry an AS, which leads to changing the `getIndirect()` interface. At
+the moment we're only using this for (indirect) returns, but it allows
+for future handling of indirect args themselves. We default to using the
+AllocaAS as that matches what Clang is currently doing, however if, in
+the future, a target would opt for e.g. placing indirect returns in some
+other storage, with another AS, this will require revisiting.
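+
+To illustrate the effect (a sketch for exposition, not text from the
+original commit): given `struct S { int a[16]; }; struct S make(void);`
+compiled for amdgcn, where the `alloca` AS is 5, the `sret` slot was
+previously typed as a generic `ptr`, whereas after this change the
+declaration is roughly
+
+  declare void @make(ptr addrspace(5) sret(%struct.S) align 4)
+
+with an `addrspacecast` inserted at call sites only where a pointer in
+the default AS is actually expected.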
+
+---------
+
+Co-authored-by: Matt Arsenault
+Co-authored-by: Matt Arsenault
+---
+ clang/include/clang/CodeGen/CGFunctionInfo.h | 11 +--
+ clang/lib/CodeGen/ABIInfo.cpp | 8 +-
+ clang/lib/CodeGen/ABIInfo.h | 3 +-
+ clang/lib/CodeGen/ABIInfoImpl.cpp | 15 ++--
+ clang/lib/CodeGen/CGCall.cpp | 32 +++++----
+ clang/lib/CodeGen/CGExprAgg.cpp | 19 ++++--
+ clang/lib/CodeGen/ItaniumCXXABI.cpp | 4 +-
+ clang/lib/CodeGen/MicrosoftCXXABI.cpp | 4 +-
+ clang/lib/CodeGen/SwiftCallingConv.cpp | 16 +++--
+ clang/lib/CodeGen/Targets/AArch64.cpp | 24 ++++---
+ clang/lib/CodeGen/Targets/AMDGPU.cpp | 3 +-
+ clang/lib/CodeGen/Targets/ARC.cpp | 11 +--
+ clang/lib/CodeGen/Targets/ARM.cpp | 32 ++++++---
+ clang/lib/CodeGen/Targets/AVR.cpp | 2 +-
+ clang/lib/CodeGen/Targets/BPF.cpp | 12 ++--
+ clang/lib/CodeGen/Targets/CSKY.cpp | 8 ++-
+ clang/lib/CodeGen/Targets/Hexagon.cpp | 18 +++--
+ clang/lib/CodeGen/Targets/Lanai.cpp | 14 ++--
+ clang/lib/CodeGen/Targets/LoongArch.cpp | 13 ++--
+ clang/lib/CodeGen/Targets/Mips.cpp | 10 +--
+ clang/lib/CodeGen/Targets/NVPTX.cpp | 8 ++-
+ clang/lib/CodeGen/Targets/PNaCl.cpp | 12 ++--
+ clang/lib/CodeGen/Targets/PPC.cpp | 35 ++++++----
+ clang/lib/CodeGen/Targets/RISCV.cpp | 13 ++--
+ clang/lib/CodeGen/Targets/SPIR.cpp | 9 ++-
+ clang/lib/CodeGen/Targets/Sparc.cpp | 7 +-
+ clang/lib/CodeGen/Targets/SystemZ.cpp | 14 ++--
+ clang/lib/CodeGen/Targets/WebAssembly.cpp | 3 +-
+ clang/lib/CodeGen/Targets/X86.cpp | 58 +++++++++++-----
+ .../test/CodeGen/partial-reinitialization2.c | 4 +-
+ clang/test/CodeGen/sret.c | 11 +++
+ .../test/CodeGenCXX/no-elide-constructors.cpp | 6 ++
+ .../CodeGenOpenCL/addr-space-struct-arg.cl | 14 ++--
+ .../amdgpu-abi-struct-arg-byref.cl | 14 ++--
+ ...plicit-addrspacecast-function-parameter.cl | 68 +++++++++++++++++++
+ 35 files changed, 371 insertions(+), 164 deletions(-)
+ create mode 100644 clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl
+
+diff --git a/clang/include/clang/CodeGen/CGFunctionInfo.h b/clang/include/clang/CodeGen/CGFunctionInfo.h
+index 9d785d878b61..040ee025afaa 100644
+--- a/clang/include/clang/CodeGen/CGFunctionInfo.h
++++ b/clang/include/clang/CodeGen/CGFunctionInfo.h
+@@ -206,8 +206,8 @@ public:
+   static ABIArgInfo getIgnore() {
+     return ABIArgInfo(Ignore);
+   }
+-  static ABIArgInfo getIndirect(CharUnits Alignment, bool ByVal = true,
+-                                bool Realign = false,
++  static ABIArgInfo getIndirect(CharUnits Alignment, unsigned AddrSpace,
++                                bool ByVal = true, bool Realign = false,
+                                 llvm::Type *Padding = nullptr) {
+     auto AI = ABIArgInfo(Indirect);
+     AI.setIndirectAlign(Alignment);
+@@ -215,6 +215,7 @@ public:
+     AI.setIndirectRealign(Realign);
+     AI.setSRetAfterThis(false);
+     AI.setPaddingType(Padding);
++    AI.setIndirectAddrSpace(AddrSpace);
+     return AI;
+   }
+
+@@ -232,7 +233,7 @@ public:
+
+   static ABIArgInfo getIndirectInReg(CharUnits Alignment, bool ByVal = true,
+                                      bool Realign = false) {
+-    auto AI = getIndirect(Alignment, ByVal, Realign);
++    auto AI = getIndirect(Alignment, 0, ByVal, Realign);
+     AI.setInReg(true);
+     return AI;
+   }
+@@ -422,12 +423,12 @@ public:
+   }
+
+   unsigned getIndirectAddrSpace() const {
+-    assert(isIndirectAliased() && "Invalid kind!");
++    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
+     return IndirectAttr.AddrSpace;
+   }
+
+   void setIndirectAddrSpace(unsigned AddrSpace) {
+-    assert(isIndirectAliased() && "Invalid kind!");
++    assert((isIndirect() || isIndirectAliased()) && "Invalid kind!");
+     IndirectAttr.AddrSpace = AddrSpace;
+   }
+
+diff --git a/clang/lib/CodeGen/ABIInfo.cpp b/clang/lib/CodeGen/ABIInfo.cpp
+index cda8a494f6c2..d981d6991363 100644
+--- a/clang/lib/CodeGen/ABIInfo.cpp
++++ b/clang/lib/CodeGen/ABIInfo.cpp
+@@ -171,11 +171,11 @@ bool ABIInfo::isPromotableIntegerTypeForABI(QualType Ty) const {
+   return false;
+ }
+
+-ABIArgInfo ABIInfo::getNaturalAlignIndirect(QualType Ty, bool ByVal,
+-                                            bool Realign,
++ABIArgInfo ABIInfo::getNaturalAlignIndirect(QualType Ty, unsigned AddrSpace,
++                                            bool ByVal, bool Realign,
+                                             llvm::Type *Padding) const {
+-  return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty), ByVal,
+-                                 Realign, Padding);
++  return ABIArgInfo::getIndirect(getContext().getTypeAlignInChars(Ty),
++                                 AddrSpace, ByVal, Realign, Padding);
+ }
+
+ ABIArgInfo ABIInfo::getNaturalAlignIndirectInReg(QualType Ty,
+diff --git a/clang/lib/CodeGen/ABIInfo.h b/clang/lib/CodeGen/ABIInfo.h
+index 213e7879c316..9c7029c99bd4 100644
+--- a/clang/lib/CodeGen/ABIInfo.h
++++ b/clang/lib/CodeGen/ABIInfo.h
+@@ -110,7 +110,8 @@ public:
+   /// A convenience method to return an indirect ABIArgInfo with an
+   /// expected alignment equal to the ABI alignment of the given type.
+   CodeGen::ABIArgInfo
+-  getNaturalAlignIndirect(QualType Ty, bool ByVal = true, bool Realign = false,
++  getNaturalAlignIndirect(QualType Ty, unsigned AddrSpace, bool ByVal = true,
++                          bool Realign = false,
+                           llvm::Type *Padding = nullptr) const;
+
+   CodeGen::ABIArgInfo getNaturalAlignIndirectInReg(QualType Ty,
+diff --git a/clang/lib/CodeGen/ABIInfoImpl.cpp b/clang/lib/CodeGen/ABIInfoImpl.cpp
+index 795874059bda..68887cd7916c 100644
+--- a/clang/lib/CodeGen/ABIInfoImpl.cpp
++++ b/clang/lib/CodeGen/ABIInfoImpl.cpp
+@@ -21,9 +21,10 @@ ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const {
+   // Records with non-trivial destructors/copy-constructors should not be
+   // passed by value.
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+
+-  return getNaturalAlignIndirect(Ty);
++  return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+ }
+
+ // Treat an enum type as its underlying type.
+@@ -36,7 +37,7 @@ ABIArgInfo DefaultABIInfo::classifyArgumentType(QualType Ty) const {
+       Context.getTypeSize(Context.getTargetInfo().hasInt128Type()
+                               ? Context.Int128Ty
+                               : Context.LongLongTy))
+-    return getNaturalAlignIndirect(Ty);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+
+   return (isPromotableIntegerTypeForABI(Ty)
+               ? ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty))
+@@ -48,7 +49,7 @@ ABIArgInfo DefaultABIInfo::classifyReturnType(QualType RetTy) const {
+     return ABIArgInfo::getIgnore();
+
+   if (isAggregateTypeForABI(RetTy))
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+
+   // Treat an enum type as its underlying type.
+   if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
+@@ -59,7 +60,8 @@ ABIArgInfo DefaultABIInfo::classifyReturnType(QualType RetTy) const {
+     getContext().getTypeSize(getContext().getTargetInfo().hasInt128Type()
+                                  ? getContext().Int128Ty
+                                  : getContext().LongLongTy))
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy,
++                                   getDataLayout().getAllocaAddrSpace());
+
+   return (isPromotableIntegerTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                               : ABIArgInfo::getDirect());
+@@ -126,7 +128,8 @@ bool CodeGen::classifyReturnType(const CGCXXABI &CXXABI, CGFunctionInfo &FI,
+   if (const auto *RT = Ty->getAs<RecordType>())
+     if (!isa<CXXRecordDecl>(RT->getDecl()) &&
+         !RT->getDecl()->canPassInRegisters()) {
+-      FI.getReturnInfo() = Info.getNaturalAlignIndirect(Ty);
++      FI.getReturnInfo() = Info.getNaturalAlignIndirect(
++          Ty, Info.getDataLayout().getAllocaAddrSpace());
+       return true;
+     }
+
+diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
+index 2dce86410db8..e6c2ac939eb8 100644
+--- a/clang/lib/CodeGen/CGCall.cpp
++++ b/clang/lib/CodeGen/CGCall.cpp
+@@ -1671,10 +1671,8 @@ CodeGenTypes::GetFunctionType(const CGFunctionInfo &FI) {
+
+   // Add type for sret argument.
+   if (IRFunctionArgs.hasSRetArg()) {
+-    QualType Ret = FI.getReturnType();
+-    unsigned AddressSpace = CGM.getTypes().getTargetAddressSpace(Ret);
+-    ArgTypes[IRFunctionArgs.getSRetArgNo()] =
+-        llvm::PointerType::get(getLLVMContext(), AddressSpace);
++    ArgTypes[IRFunctionArgs.getSRetArgNo()] = llvm::PointerType::get(
++        getLLVMContext(), FI.getReturnInfo().getIndirectAddrSpace());
+   }
+
+   // Add type for inalloca argument.
+@@ -5144,7 +5142,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
+   // If the call returns a temporary with struct return, create a temporary
+   // alloca to hold the result, unless one is given to us.
+   Address SRetPtr = Address::invalid();
+-  RawAddress SRetAlloca = RawAddress::invalid();
+   llvm::Value *UnusedReturnSizePtr = nullptr;
+   if (RetAI.isIndirect() || RetAI.isInAlloca() || RetAI.isCoerceAndExpand()) {
+     // For virtual function pointer thunks and musttail calls, we must always
+@@ -5158,11 +5155,11 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
+     } else if (!ReturnValue.isNull()) {
+       SRetPtr = ReturnValue.getAddress();
+     } else {
+-      SRetPtr = CreateMemTemp(RetTy, "tmp", &SRetAlloca);
++      SRetPtr = CreateMemTempWithoutCast(RetTy, "tmp");
+       if (HaveInsertPoint() && ReturnValue.isUnused()) {
+         llvm::TypeSize size =
+             CGM.getDataLayout().getTypeAllocSize(ConvertTypeForMem(RetTy));
+-        UnusedReturnSizePtr = EmitLifetimeStart(size, SRetAlloca.getPointer());
++        UnusedReturnSizePtr = EmitLifetimeStart(size, SRetPtr.getBasePointer());
+       }
+     }
+     if (IRFunctionArgs.hasSRetArg()) {
+@@ -5397,11 +5394,22 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
+           V->getType()->isIntegerTy())
+         V = Builder.CreateZExt(V, ArgInfo.getCoerceToType());
+
+-      // If the argument doesn't match, perform a bitcast to coerce it. This
+-      // can happen due to trivial type mismatches.
++      // The only plausible mismatch here would be for pointer address spaces,
++      // which can happen e.g. when passing a sret arg that is in the AllocaAS
++      // to a function that takes a pointer to an argument in the DefaultAS.
++      // We assume that the target has a reasonable mapping for the DefaultAS
++      // (it can be casted to from incoming specific ASes), and insert an AS
++      // cast to address the mismatch.
+       if (FirstIRArg < IRFuncTy->getNumParams() &&
+-          V->getType() != IRFuncTy->getParamType(FirstIRArg))
+-        V = Builder.CreateBitCast(V, IRFuncTy->getParamType(FirstIRArg));
++          V->getType() != IRFuncTy->getParamType(FirstIRArg)) {
++        assert(V->getType()->isPointerTy() && "Only pointers can mismatch!");
++        auto FormalAS = CallInfo.arguments()[ArgNo]
++                            .type.getQualifiers()
++                            .getAddressSpace();
++        auto ActualAS = I->Ty.getAddressSpace();
++        V = getTargetHooks().performAddrSpaceCast(
++            *this, V, ActualAS, FormalAS, IRFuncTy->getParamType(FirstIRArg));
++      }
+
+       if (ArgHasMaybeUndefAttr)
+         V = Builder.CreateFreeze(V);
+@@ -5737,7 +5745,7 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo,
+     // pop this cleanup later on. Being eager about this is OK, since this
+     // temporary is 'invisible' outside of the callee.
+     if (UnusedReturnSizePtr)
+-      pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetAlloca,
++      pushFullExprCleanup<CallLifetimeEnd>(NormalEHLifetimeMarker, SRetPtr,
+                                            UnusedReturnSizePtr);
+
+     llvm::BasicBlock *InvokeDest = CannotThrow ? nullptr : getInvokeDest();
+diff --git a/clang/lib/CodeGen/CGExprAgg.cpp b/clang/lib/CodeGen/CGExprAgg.cpp
+index 2ad6587089f1..f9c9c5df8016 100644
+--- a/clang/lib/CodeGen/CGExprAgg.cpp
++++ b/clang/lib/CodeGen/CGExprAgg.cpp
+@@ -296,18 +296,25 @@ void AggExprEmitter::withReturnValueSlot(
+       (RequiresDestruction && Dest.isIgnored());
+
+   Address RetAddr = Address::invalid();
+-  RawAddress RetAllocaAddr = RawAddress::invalid();
+
+   EHScopeStack::stable_iterator LifetimeEndBlock;
+   llvm::Value *LifetimeSizePtr = nullptr;
+   llvm::IntrinsicInst *LifetimeStartInst = nullptr;
+   if (!UseTemp) {
+-    RetAddr = Dest.getAddress();
++    // It is possible for the existing slot we are using directly to have been
++    // allocated in the correct AS for an indirect return, and then cast to
++    // the default AS (this is the behaviour of CreateMemTemp), however we know
++    // that the return address is expected to point to the uncasted AS, hence we
++    // strip possible pointer casts here.
++    if (Dest.getAddress().isValid())
++      RetAddr = Dest.getAddress().withPointer(
++          Dest.getAddress().getBasePointer()->stripPointerCasts(),
++          Dest.getAddress().isKnownNonNull());
+   } else {
+-    RetAddr = CGF.CreateMemTemp(RetTy, "tmp", &RetAllocaAddr);
++    RetAddr = CGF.CreateMemTempWithoutCast(RetTy, "tmp");
+     llvm::TypeSize Size =
+         CGF.CGM.getDataLayout().getTypeAllocSize(CGF.ConvertTypeForMem(RetTy));
+-    LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAllocaAddr.getPointer());
++    LifetimeSizePtr = CGF.EmitLifetimeStart(Size, RetAddr.getBasePointer());
+     if (LifetimeSizePtr) {
+       LifetimeStartInst =
+           cast<llvm::IntrinsicInst>(std::prev(Builder.GetInsertPoint()));
+@@ -316,7 +323,7 @@ void AggExprEmitter::withReturnValueSlot(
+           "Last insertion wasn't a lifetime.start?");
+
+       CGF.pushFullExprCleanup<CallLifetimeEnd>(
+-          NormalEHLifetimeMarker, RetAllocaAddr, LifetimeSizePtr);
++          NormalEHLifetimeMarker, RetAddr, LifetimeSizePtr);
+       LifetimeEndBlock = CGF.EHStack.stable_begin();
+     }
+   }
+@@ -337,7 +344,7 @@ void AggExprEmitter::withReturnValueSlot(
+     // Since we're not guaranteed to be in an ExprWithCleanups, clean up
+     // eagerly.
+     CGF.DeactivateCleanupBlock(LifetimeEndBlock, LifetimeStartInst);
+-    CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAllocaAddr.getPointer());
++    CGF.EmitLifetimeEnd(LifetimeSizePtr, RetAddr.getBasePointer());
+   }
+ }
+
+diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp
+index 7375a511809b..bcd171724c41 100644
+--- a/clang/lib/CodeGen/ItaniumCXXABI.cpp
++++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp
+@@ -1350,7 +1350,9 @@ bool ItaniumCXXABI::classifyReturnType(CGFunctionInfo &FI) const {
+   // If C++ prohibits us from making a copy, return by address.
+   if (!RD->canPassInRegisters()) {
+     auto Align = CGM.getContext().getTypeAlignInChars(FI.getReturnType());
+-    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
++    FI.getReturnInfo() = ABIArgInfo::getIndirect(
++        Align, /*AddrSpace=*/CGM.getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/false);
+     return true;
+   }
+   return false;
+diff --git a/clang/lib/CodeGen/MicrosoftCXXABI.cpp b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+index 4a2630e83b62..5cb742a92a9b 100644
+--- a/clang/lib/CodeGen/MicrosoftCXXABI.cpp
++++ b/clang/lib/CodeGen/MicrosoftCXXABI.cpp
+@@ -1172,7 +1172,9 @@ bool MicrosoftCXXABI::classifyReturnType(CGFunctionInfo &FI) const {
+
+   if (isIndirectReturn) {
+     CharUnits Align = CGM.getContext().getTypeAlignInChars(FI.getReturnType());
+-    FI.getReturnInfo() = ABIArgInfo::getIndirect(Align, /*ByVal=*/false);
++    FI.getReturnInfo() = ABIArgInfo::getIndirect(
++        Align, /*AddrSpace=*/CGM.getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/false);
+
+     // MSVC always passes `this` before the `sret` parameter.
+     FI.getReturnInfo().setSRetAfterThis(FI.isInstanceMethod());
+diff --git a/clang/lib/CodeGen/SwiftCallingConv.cpp b/clang/lib/CodeGen/SwiftCallingConv.cpp
+index 1ff4ece2811e..10f9f20bca31 100644
+--- a/clang/lib/CodeGen/SwiftCallingConv.cpp
++++ b/clang/lib/CodeGen/SwiftCallingConv.cpp
+@@ -796,11 +796,14 @@ bool swiftcall::mustPassRecordIndirectly(CodeGenModule &CGM,
+
+ static ABIArgInfo classifyExpandedType(SwiftAggLowering &lowering,
+                                        bool forReturn,
+-                                       CharUnits alignmentForIndirect) {
++                                       CharUnits alignmentForIndirect,
++                                       unsigned IndirectAS) {
+   if (lowering.empty()) {
+     return ABIArgInfo::getIgnore();
+   } else if (lowering.shouldPassIndirectly(forReturn)) {
+-    return ABIArgInfo::getIndirect(alignmentForIndirect, /*byval*/ false);
++    return ABIArgInfo::getIndirect(alignmentForIndirect,
++                                   /*AddrSpace=*/IndirectAS,
++                                   /*byval=*/false);
+   } else {
+     auto types = lowering.getCoerceAndExpandTypes();
+     return ABIArgInfo::getCoerceAndExpand(types.first, types.second);
+@@ -809,18 +812,21 @@ static ABIArgInfo classifyExpandedType(SwiftAggLowering &lowering,
+
+ static ABIArgInfo classifyType(CodeGenModule &CGM, CanQualType type,
+                                bool forReturn) {
++  unsigned IndirectAS = CGM.getDataLayout().getAllocaAddrSpace();
+   if (auto recordType = dyn_cast<RecordType>(type)) {
+     auto record = recordType->getDecl();
+     auto &layout = CGM.getContext().getASTRecordLayout(record);
+
+     if (mustPassRecordIndirectly(CGM, record))
+-      return ABIArgInfo::getIndirect(layout.getAlignment(), /*byval*/ false);
++      return ABIArgInfo::getIndirect(layout.getAlignment(),
++                                     /*AddrSpace=*/IndirectAS, /*byval=*/false);
+
+     SwiftAggLowering lowering(CGM);
+     lowering.addTypedData(recordType->getDecl(), CharUnits::Zero(), layout);
+     lowering.finish();
+
+-    return classifyExpandedType(lowering, forReturn, layout.getAlignment());
++    return classifyExpandedType(lowering, forReturn, layout.getAlignment(),
++                                IndirectAS);
+   }
+
+   // Just assume that all of our target ABIs can support returning at least
+@@ -836,7 +842,7 @@ static ABIArgInfo classifyType(CodeGenModule &CGM, CanQualType type,
+     lowering.finish();
+
+     CharUnits alignment = CGM.getContext().getTypeAlignInChars(type);
+-    return classifyExpandedType(lowering, forReturn, alignment);
++    return classifyExpandedType(lowering, forReturn, alignment, IndirectAS);
+   }
+
+   // Member pointer types need to be expanded, but it's a simple form of
+diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp
+index e2e434815d43..e2b65a758241 100644
+--- a/clang/lib/CodeGen/Targets/AArch64.cpp
++++ b/clang/lib/CodeGen/Targets/AArch64.cpp
+@@ -327,7 +327,8 @@ ABIArgInfo AArch64ABIInfo::coerceIllegalVector(QualType Ty, unsigned &NSRN,
+     return ABIArgInfo::getDirect(ResType);
+   }
+
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                 /*ByVal=*/false);
+ }
+
+ ABIArgInfo AArch64ABIInfo::coerceAndExpandPureScalableAggregate(
+@@ -335,7 +336,8 @@ ABIArgInfo AArch64ABIInfo::coerceAndExpandPureScalableAggregate(
+     const SmallVectorImpl<llvm::Type *> &UnpaddedCoerceToSeq, unsigned &NSRN,
+     unsigned &NPRN) const {
+   if (!IsNamedArg || NSRN + NVec > 8 || NPRN + NPred > 4)
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   /*ByVal=*/false);
+   NSRN += NVec;
+   NPRN += NPred;
+
+@@ -375,7 +377,8 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
+
+     if (const auto *EIT = Ty->getAs<BitIntType>())
+       if (EIT->getNumBits() > 128)
+-        return getNaturalAlignIndirect(Ty, false);
++        return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                       false);
+
+     if (Ty->isVectorType())
+       NSRN = std::min(NSRN + 1, 8u);
+@@ -411,8 +414,9 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
+   // Structures with either a non-trivial destructor or a non-trivial
+   // copy constructor are always indirect.
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+-                                           CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(
++        Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/RAA == CGCXXABI::RAA_DirectInMemory);
+   }
+
+   // Empty records:
+@@ -489,7 +493,8 @@ ABIArgInfo AArch64ABIInfo::classifyArgumentType(QualType Ty, bool IsVariadicFn,
+                    : llvm::ArrayType::get(BaseTy, Size / Alignment));
+   }
+
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                 /*ByVal=*/false);
+ }
+
+ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
+@@ -507,7 +512,7 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
+
+   // Large vector types should be returned via memory.
+   if (RetTy->isVectorType() && getContext().getTypeSize(RetTy) > 128)
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+
+   if (!passAsAggregateType(RetTy)) {
+     // Treat an enum type as its underlying type.
+@@ -516,7 +521,8 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
+
+     if (const auto *EIT = RetTy->getAs<BitIntType>())
+       if (EIT->getNumBits() > 128)
+-        return getNaturalAlignIndirect(RetTy);
++        return getNaturalAlignIndirect(RetTy,
++                                       getDataLayout().getAllocaAddrSpace());
+
+     return (isPromotableIntegerTypeForABI(RetTy) && isDarwinPCS()
+                 ? ABIArgInfo::getExtend(RetTy)
+@@ -575,7 +581,7 @@ ABIArgInfo AArch64ABIInfo::classifyReturnType(QualType RetTy,
+     return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Size));
+   }
+
+-  return getNaturalAlignIndirect(RetTy);
++  return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+ }
+
+ /// isIllegalVectorType - check whether the vector type is legal for AArch64.
+diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
+index 788eac5f2823..dc45def4f324 100644
+--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
++++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
+@@ -236,7 +236,8 @@ ABIArgInfo AMDGPUABIInfo::classifyArgumentType(QualType Ty, bool Variadic,
+     // Records with non-trivial destructors/copy-constructors should not be
+     // passed by value.
+     if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
+
+     // Ignore empty structs/unions.
+     if (isEmptyRecord(getContext(), Ty, true))
+diff --git a/clang/lib/CodeGen/Targets/ARC.cpp b/clang/lib/CodeGen/Targets/ARC.cpp
+index 1904e8fdb388..c8db7e8f9706 100644
+--- a/clang/lib/CodeGen/Targets/ARC.cpp
++++ b/clang/lib/CodeGen/Targets/ARC.cpp
+@@ -69,16 +69,19 @@ public:
+
+
+ ABIArgInfo ARCABIInfo::getIndirectByRef(QualType Ty, bool HasFreeRegs) const {
+-  return HasFreeRegs ? getNaturalAlignIndirectInReg(Ty) :
+-                       getNaturalAlignIndirect(Ty, false);
++  return HasFreeRegs ? getNaturalAlignIndirectInReg(Ty)
++                     : getNaturalAlignIndirect(
++                           Ty, getDataLayout().getAllocaAddrSpace(), false);
+ }
+
+ ABIArgInfo ARCABIInfo::getIndirectByValue(QualType Ty) const {
+   // Compute the byval alignment.
+   const unsigned MinABIStackAlignInBytes = 4;
+   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
+-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true,
+-                                 TypeAlign > MinABIStackAlignInBytes);
++  return ABIArgInfo::getIndirect(
++      CharUnits::fromQuantity(4),
++      /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/true, TypeAlign > MinABIStackAlignInBytes);
+ }
+
+ RValue ARCABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+diff --git a/clang/lib/CodeGen/Targets/ARM.cpp b/clang/lib/CodeGen/Targets/ARM.cpp
+index 47e31ceeaf29..0aad5703ffac 100644
+--- a/clang/lib/CodeGen/Targets/ARM.cpp
++++ b/clang/lib/CodeGen/Targets/ARM.cpp
+@@ -299,7 +299,9 @@ ABIArgInfo ARMABIInfo::coerceIllegalVector(QualType Ty) const {
+         llvm::Type::getInt32Ty(getVMContext()), Size / 32);
+     return ABIArgInfo::getDirect(ResType);
+   }
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(
++      Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/false);
+ }
+
+ ABIArgInfo ARMABIInfo::classifyHomogeneousAggregate(QualType Ty,
+@@ -381,7 +383,9 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
+
+     if (const auto *EIT = Ty->getAs<BitIntType>())
+       if (EIT->getNumBits() > 64)
+-        return getNaturalAlignIndirect(Ty, /*ByVal=*/true);
++        return getNaturalAlignIndirect(
++            Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++            /*ByVal=*/true);
+
+     return (isPromotableIntegerTypeForABI(Ty)
+                 ? ABIArgInfo::getExtend(Ty, CGT.ConvertType(Ty))
+@@ -389,7 +393,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
+   }
+
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+   }
+
+   // Empty records are either ignored completely or passed as if they were a
+@@ -429,7 +434,8 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
+     // bigger than 128-bits, they get placed in space allocated by the caller,
+     // and a pointer is passed.
+     return ABIArgInfo::getIndirect(
+-        CharUnits::fromQuantity(getContext().getTypeAlign(Ty) / 8), false);
++        CharUnits::fromQuantity(getContext().getTypeAlign(Ty) / 8),
++        getDataLayout().getAllocaAddrSpace(), false);
+   }
+
+   // Support byval for ARM.
+@@ -447,9 +453,10 @@ ABIArgInfo ARMABIInfo::classifyArgumentType(QualType Ty, bool isVariadic,
+   }
+   if (getContext().getTypeSizeInChars(Ty) > CharUnits::fromQuantity(64)) {
+     assert(getABIKind() != ARMABIKind::AAPCS16_VFP && "unexpected byval");
+-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign),
+-                                   /*ByVal=*/true,
+-                                   /*Realign=*/TyAlign > ABIAlign);
++    return ABIArgInfo::getIndirect(
++        CharUnits::fromQuantity(ABIAlign),
++        /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/true, /*Realign=*/TyAlign > ABIAlign);
+   }
+
+   // Otherwise, pass by coercing to a structure of the appropriate size.
+@@ -566,7 +573,8 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
+   if (const VectorType *VT = RetTy->getAs<VectorType>()) {
+     // Large vector types should be returned via memory.
+     if (getContext().getTypeSize(RetTy) > 128)
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+     // TODO: FP16/BF16 vectors should be converted to integer vectors
+     // This check is similar to isIllegalVectorType - refactor?
+     if ((!getTarget().hasLegalHalfType() &&
+@@ -584,7 +592,9 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
+
+   if (const auto *EIT = RetTy->getAs<BitIntType>())
+     if (EIT->getNumBits() > 64)
+-      return getNaturalAlignIndirect(RetTy, /*ByVal=*/false);
++      return getNaturalAlignIndirect(
++          RetTy, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++          /*ByVal=*/false);
+
+   return isPromotableIntegerTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                               : ABIArgInfo::getDirect();
+@@ -615,7 +625,7 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
+   }
+
+   // Otherwise return in memory.
+-  return getNaturalAlignIndirect(RetTy);
++  return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+ }
+
+ // Otherwise this is an AAPCS variant.
+@@ -653,7 +663,7 @@ ABIArgInfo ARMABIInfo::classifyReturnType(QualType RetTy, bool isVariadic,
+     return ABIArgInfo::getDirect(CoerceTy);
+   }
+
+-  return getNaturalAlignIndirect(RetTy);
++  return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+ }
+
+ /// isIllegalVector - check whether Ty is an illegal vector type.
+diff --git a/clang/lib/CodeGen/Targets/AVR.cpp b/clang/lib/CodeGen/Targets/AVR.cpp
+index 50547dd6dec5..26e2a22f14d1 100644
+--- a/clang/lib/CodeGen/Targets/AVR.cpp
++++ b/clang/lib/CodeGen/Targets/AVR.cpp
+@@ -45,7 +45,7 @@ public:
+     // stack slot, along with a pointer as the function's implicit argument.
+     if (getContext().getTypeSize(Ty) > RetRegs * 8) {
+       LargeRet = true;
+-      return getNaturalAlignIndirect(Ty);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+     }
+     // An i8 return value should not be extended to i16, since AVR has 8-bit
+     // registers.
+diff --git a/clang/lib/CodeGen/Targets/BPF.cpp b/clang/lib/CodeGen/Targets/BPF.cpp
+index 2849222f7a18..880a891083c3 100644
+--- a/clang/lib/CodeGen/Targets/BPF.cpp
++++ b/clang/lib/CodeGen/Targets/BPF.cpp
+@@ -42,7 +42,8 @@ public:
+       }
+       return ABIArgInfo::getDirect(CoerceTy);
+     } else {
+-      return getNaturalAlignIndirect(Ty);
++      return getNaturalAlignIndirect(Ty,
++                                     getDataLayout().getAllocaAddrSpace());
+     }
+   }
+
+@@ -52,7 +53,8 @@ public:
+     ASTContext &Context = getContext();
+     if (const auto *EIT = Ty->getAs<BitIntType>())
+       if (EIT->getNumBits() > Context.getTypeSize(Context.Int128Ty))
+-        return getNaturalAlignIndirect(Ty);
++        return getNaturalAlignIndirect(Ty,
++                                       getDataLayout().getAllocaAddrSpace());
+
+     return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
+                                               : ABIArgInfo::getDirect());
+@@ -63,7 +65,8 @@ public:
+       return ABIArgInfo::getIgnore();
+
+     if (isAggregateTypeForABI(RetTy))
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+
+     // Treat an enum type as its underlying type.
+     if (const EnumType *EnumTy = RetTy->getAs<EnumType>())
+@@ -72,7 +75,8 @@ public:
+     ASTContext &Context = getContext();
+     if (const auto *EIT = RetTy->getAs<BitIntType>())
+       if (EIT->getNumBits() > Context.getTypeSize(Context.Int128Ty))
+-        return getNaturalAlignIndirect(RetTy);
++        return getNaturalAlignIndirect(RetTy,
++                                       getDataLayout().getAllocaAddrSpace());
+
+     // Caller will do necessary sign/zero extension.
+     return ABIArgInfo::getDirect();
+diff --git a/clang/lib/CodeGen/Targets/CSKY.cpp b/clang/lib/CodeGen/Targets/CSKY.cpp
+index d8720afd1a71..ef26d483a180 100644
+--- a/clang/lib/CodeGen/Targets/CSKY.cpp
++++ b/clang/lib/CodeGen/Targets/CSKY.cpp
+@@ -82,8 +82,9 @@ ABIArgInfo CSKYABIInfo::classifyArgumentType(QualType Ty, int &ArgGPRsLeft,
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+     if (ArgGPRsLeft)
+       ArgGPRsLeft -= 1;
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+-                                           CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(
++        Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/RAA == CGCXXABI::RAA_DirectInMemory);
+   }
+
+   // Ignore empty structs/unions.
+@@ -144,7 +145,8 @@ ABIArgInfo CSKYABIInfo::classifyArgumentType(QualType Ty, int &ArgGPRsLeft,
+           llvm::IntegerType::get(getVMContext(), XLen), (Size + 31) / XLen));
+     }
+   }
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                 /*ByVal=*/false);
+ }
+
+ ABIArgInfo CSKYABIInfo::classifyReturnType(QualType RetTy) const {
+diff --git a/clang/lib/CodeGen/Targets/Hexagon.cpp b/clang/lib/CodeGen/Targets/Hexagon.cpp
+index 8fd2a81494d9..667599d2d9a6 100644
+--- a/clang/lib/CodeGen/Targets/Hexagon.cpp
++++ b/clang/lib/CodeGen/Targets/Hexagon.cpp
+@@ -105,14 +105,16 @@ ABIArgInfo HexagonABIInfo::classifyArgumentType(QualType Ty,
+       HexagonAdjustRegsLeft(Size, RegsLeft);
+
+     if (Size > 64 && Ty->isBitIntType())
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/true);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     /*ByVal=*/true);
+
+     return isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
+                                              : ABIArgInfo::getDirect();
+   }
+
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+
+   // Ignore empty records.
+   if (isEmptyRecord(getContext(), Ty, true))
+@@ -122,7 +124,8 @@ ABIArgInfo HexagonABIInfo::classifyArgumentType(QualType Ty,
+   unsigned Align = getContext().getTypeAlign(Ty);
+
+   if (Size > 64)
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/true);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   /*ByVal=*/true);
+
+   if (HexagonAdjustRegsLeft(Size, RegsLeft))
+     Align = Size <= 32 ? 32 : 64;
+@@ -151,7 +154,8 @@ ABIArgInfo HexagonABIInfo::classifyReturnType(QualType RetTy) const {
+     }
+     // Large vector types should be returned via memory.
+     if (Size > 64)
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+   }
+
+   if (!isAggregateTypeForABI(RetTy)) {
+@@ -160,7 +164,8 @@ ABIArgInfo HexagonABIInfo::classifyReturnType(QualType RetTy) const {
+       RetTy = EnumTy->getDecl()->getIntegerType();
+
+     if (Size > 64 && RetTy->isBitIntType())
+-      return getNaturalAlignIndirect(RetTy, /*ByVal=*/false);
++      return getNaturalAlignIndirect(
++          RetTy, getDataLayout().getAllocaAddrSpace(), /*ByVal=*/false);
+
+     return isPromotableIntegerTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                                 : ABIArgInfo::getDirect();
+@@ -176,7 +181,8 @@ ABIArgInfo HexagonABIInfo::classifyReturnType(QualType RetTy) const {
+     Size = llvm::bit_ceil(Size);
+     return ABIArgInfo::getDirect(llvm::Type::getIntNTy(getVMContext(), Size));
+   }
+-  return getNaturalAlignIndirect(RetTy, /*ByVal=*/true);
++  return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace(),
++                                 /*ByVal=*/true);
+ }
+
+ Address HexagonABIInfo::EmitVAArgFromMemory(CodeGenFunction &CGF,
+diff --git a/clang/lib/CodeGen/Targets/Lanai.cpp b/clang/lib/CodeGen/Targets/Lanai.cpp
+index 2578fc0291e7..6f75bd54a8ef 100644
+--- a/clang/lib/CodeGen/Targets/Lanai.cpp
++++ b/clang/lib/CodeGen/Targets/Lanai.cpp
+@@ -72,15 +72,17 @@ ABIArgInfo LanaiABIInfo::getIndirectResult(QualType Ty, bool ByVal,
+       --State.FreeRegs; // Non-byval indirects just use one pointer.
+       return getNaturalAlignIndirectInReg(Ty);
+     }
+-    return getNaturalAlignIndirect(Ty, false);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   false);
+   }
+
+   // Compute the byval alignment.
+   const unsigned MinABIStackAlignInBytes = 4;
+   unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
+-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true,
+-                                 /*Realign=*/TypeAlign >
+-                                     MinABIStackAlignInBytes);
++  return ABIArgInfo::getIndirect(
++      CharUnits::fromQuantity(4),
++      /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), /*ByVal=*/true,
++      /*Realign=*/TypeAlign > MinABIStackAlignInBytes);
+ }
+
+ ABIArgInfo LanaiABIInfo::classifyArgumentType(QualType Ty,
+@@ -92,7 +94,9 @@ ABIArgInfo LanaiABIInfo::classifyArgumentType(QualType Ty,
+     if (RAA == CGCXXABI::RAA_Indirect) {
+       return getIndirectResult(Ty, /*ByVal=*/false, State);
+     } else if (RAA == CGCXXABI::RAA_DirectInMemory) {
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/true);
++      return getNaturalAlignIndirect(
++          Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++          /*ByVal=*/true);
+     }
+   }
+
+diff --git a/clang/lib/CodeGen/Targets/LoongArch.cpp b/clang/lib/CodeGen/Targets/LoongArch.cpp
+index 6c90e48a5ea4..0f689371a60d 100644
+--- a/clang/lib/CodeGen/Targets/LoongArch.cpp
++++ b/clang/lib/CodeGen/Targets/LoongArch.cpp
+@@ -305,8 +305,9 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+     if (GARsLeft)
+       GARsLeft -= 1;
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+-                                           CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(
++        Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/RAA == CGCXXABI::RAA_DirectInMemory);
+   }
+
+   uint64_t Size = getContext().getTypeSize(Ty);
+@@ -381,7 +382,9 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+     if (EIT->getNumBits() > 128 ||
+         (!getContext().getTargetInfo().hasInt128Type() &&
+          EIT->getNumBits() > 64))
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++      return getNaturalAlignIndirect(
++          Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++          /*ByVal=*/false);
+   }
+
+   return ABIArgInfo::getDirect();
+@@ -404,7 +407,9 @@ ABIArgInfo LoongArchABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+     return ABIArgInfo::getDirect(
+         llvm::ArrayType::get(llvm::IntegerType::get(getVMContext(), GRLen), 2));
+   }
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(
++      Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/false);
+ }
+
+ ABIArgInfo LoongArchABIInfo::classifyReturnType(QualType RetTy) const {
+diff --git a/clang/lib/CodeGen/Targets/Mips.cpp b/clang/lib/CodeGen/Targets/Mips.cpp
+index 771a85c84b35..c025f7312959 100644
+--- a/clang/lib/CodeGen/Targets/Mips.cpp
++++ b/clang/lib/CodeGen/Targets/Mips.cpp
+@@ -226,7 +226,8 @@ MipsABIInfo::classifyArgumentType(QualType Ty, uint64_t &Offset) const {
+
+     if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+       Offset = OrigOffset + MinABIStackAlignInBytes;
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
+     }
+
+     // If we have reached here, aggregates are passed directly by coercing to
+@@ -248,7 +249,7 @@ MipsABIInfo::classifyArgumentType(QualType Ty, uint64_t &Offset) const {
+     if (EIT->getNumBits() > 128 ||
+         (EIT->getNumBits() > 64 &&
+          !getContext().getTargetInfo().hasInt128Type()))
+-      return getNaturalAlignIndirect(Ty);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+
+   // All integral types are promoted to the GPR width.
+   if (Ty->isIntegralOrEnumerationType())
+@@ -327,7 +328,7 @@ ABIArgInfo MipsABIInfo::classifyReturnType(QualType RetTy) const {
+       }
+     }
+
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+   }
+
+   // Treat an enum type as its underlying type.
+@@ -339,7 +340,8 @@ ABIArgInfo MipsABIInfo::classifyReturnType(QualType RetTy) const {
+     if (EIT->getNumBits() > 128 ||
+         (EIT->getNumBits() > 64 &&
+          !getContext().getTargetInfo().hasInt128Type()))
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+
+   if (isPromotableIntegerTypeForABI(RetTy))
+     return ABIArgInfo::getExtend(RetTy);
+diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp
+index b82e4ddb9f3f..c236de8db70e 100644
+--- a/clang/lib/CodeGen/Targets/NVPTX.cpp
++++ b/clang/lib/CodeGen/Targets/NVPTX.cpp
+@@ -192,14 +192,18 @@ ABIArgInfo NVPTXABIInfo::classifyArgumentType(QualType Ty) const {
+       return ABIArgInfo::getDirect(
+           CGInfo.getCUDADeviceBuiltinTextureDeviceType());
+     }
+-    return getNaturalAlignIndirect(Ty, /* byval */ true);
++    return getNaturalAlignIndirect(
++        Ty, /* AddrSpace */ getDataLayout().getAllocaAddrSpace(),
++        /* byval */ true);
+   }
+
+   if (const auto *EIT = Ty->getAs<BitIntType>()) {
+     if ((EIT->getNumBits() > 128) ||
+         (!getContext().getTargetInfo().hasInt128Type() &&
+          EIT->getNumBits() > 64))
+-      return getNaturalAlignIndirect(Ty, /* byval */ true);
++      return getNaturalAlignIndirect(
++          Ty, /* AddrSpace */ getDataLayout().getAllocaAddrSpace(),
++          /* byval */ true);
+   }
+
+   return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty)
+diff --git a/clang/lib/CodeGen/Targets/PNaCl.cpp b/clang/lib/CodeGen/Targets/PNaCl.cpp
+index 9b7d757df3a3..358010785850 100644
+--- a/clang/lib/CodeGen/Targets/PNaCl.cpp
++++ b/clang/lib/CodeGen/Targets/PNaCl.cpp
+@@ -63,8 +63,9 @@ RValue PNaClABIInfo::EmitVAArg(CodeGenFunction &CGF, Address VAListAddr,
+ ABIArgInfo PNaClABIInfo::classifyArgumentType(QualType Ty) const {
+   if (isAggregateTypeForABI(Ty)) {
+     if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
+-    return getNaturalAlignIndirect(Ty);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+   } else if (const EnumType *EnumTy = Ty->getAs<EnumType>()) {
+     // Treat an enum type as its underlying type.
+     Ty = EnumTy->getDecl()->getIntegerType();
+@@ -75,7 +76,7 @@ ABIArgInfo PNaClABIInfo::classifyArgumentType(QualType Ty) const {
+     // Treat bit-precise integers as integers if <= 64, otherwise pass
+     // indirectly.
+     if (EIT->getNumBits() > 64)
+-      return getNaturalAlignIndirect(Ty);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace());
+     return ABIArgInfo::getDirect();
+   }
+
+@@ -89,12 +90,13 @@ ABIArgInfo PNaClABIInfo::classifyReturnType(QualType RetTy) const {
+
+   // In the PNaCl ABI we always return records/structures on the stack.
+   if (isAggregateTypeForABI(RetTy))
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+
+   // Treat bit-precise integers as integers if <= 64, otherwise pass indirectly.
+   if (const auto *EIT = RetTy->getAs<BitIntType>()) {
+     if (EIT->getNumBits() > 64)
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+     return ABIArgInfo::getDirect();
+   }
+
+diff --git a/clang/lib/CodeGen/Targets/PPC.cpp b/clang/lib/CodeGen/Targets/PPC.cpp
+index 989e46f4b66a..4df4c9f3c347 100644
+--- a/clang/lib/CodeGen/Targets/PPC.cpp
++++ b/clang/lib/CodeGen/Targets/PPC.cpp
+@@ -189,7 +189,7 @@ ABIArgInfo AIXABIInfo::classifyReturnType(QualType RetTy) const {
+     return ABIArgInfo::getIgnore();
+
+   if (isAggregateTypeForABI(RetTy))
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+
+   return (isPromotableTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                         : ABIArgInfo::getDirect());
+@@ -208,13 +208,16 @@ ABIArgInfo AIXABIInfo::classifyArgumentType(QualType Ty) const {
+     // Records with non-trivial destructors/copy-constructors should not be
+     // passed by value.
+     if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
+
+     CharUnits CCAlign = getParamTypeAlignment(Ty);
+     CharUnits TyAlign = getContext().getTypeAlignInChars(Ty);
+
+-    return ABIArgInfo::getIndirect(CCAlign, /*ByVal*/ true,
+-                                   /*Realign*/ TyAlign > CCAlign);
++    return ABIArgInfo::getIndirect(
++        CCAlign, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/true,
++        /*Realign=*/TyAlign > CCAlign);
+   }
+
+   return (isPromotableTypeForABI(Ty)
+@@ -833,7 +836,8 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
+   if (Ty->isVectorType()) {
+     uint64_t Size = getContext().getTypeSize(Ty);
+     if (Size > 128)
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     /*ByVal=*/false);
+     else if (Size < 128) {
+       llvm::Type *CoerceTy = llvm::IntegerType::get(getVMContext(), Size);
+       return ABIArgInfo::getDirect(CoerceTy);
+@@ -842,11 +846,13 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
+
+   if (const auto *EIT = Ty->getAs<BitIntType>())
+     if (EIT->getNumBits() > 128)
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/true);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     /*ByVal=*/true);
+
+   if (isAggregateTypeForABI(Ty)) {
+     if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
+
+     uint64_t ABIAlign = getParamTypeAlignment(Ty).getQuantity();
+     uint64_t TyAlign = getContext().getTypeAlignInChars(Ty).getQuantity();
+@@ -887,9 +893,10 @@ PPC64_SVR4_ABIInfo::classifyArgumentType(QualType Ty) const {
+     }
+
+     // All other aggregates are passed ByVal.
+-    return ABIArgInfo::getIndirect(CharUnits::fromQuantity(ABIAlign),
+-                                   /*ByVal=*/true,
+-                                   /*Realign=*/TyAlign > ABIAlign);
++    return ABIArgInfo::getIndirect(
++        CharUnits::fromQuantity(ABIAlign),
++        /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/true, /*Realign=*/TyAlign > ABIAlign);
+   }
+
+   return (isPromotableTypeForABI(Ty)
+@@ -910,7 +917,8 @@ PPC64_SVR4_ABIInfo::classifyReturnType(QualType RetTy) const {
+   if (RetTy->isVectorType()) {
+     uint64_t Size = getContext().getTypeSize(RetTy);
+     if (Size > 128)
+-      return getNaturalAlignIndirect(RetTy);
++      return getNaturalAlignIndirect(RetTy,
++                                     getDataLayout().getAllocaAddrSpace());
+     else if (Size < 128) {
+       llvm::Type *CoerceTy = llvm::IntegerType::get(getVMContext(), Size);
+       return ABIArgInfo::getDirect(CoerceTy);
+@@ -919,7 +927,8 @@ PPC64_SVR4_ABIInfo::classifyReturnType(QualType RetTy) const {
+
+   if (const auto *EIT = RetTy->getAs<BitIntType>())
+     if (EIT->getNumBits() > 128)
+-      return getNaturalAlignIndirect(RetTy, /*ByVal=*/false);
++      return getNaturalAlignIndirect(
++          RetTy, getDataLayout().getAllocaAddrSpace(), /*ByVal=*/false);
+
+   if (isAggregateTypeForABI(RetTy)) {
+     // ELFv2 homogeneous aggregates are returned as array types.
+@@ -949,7 +958,7 @@ PPC64_SVR4_ABIInfo::classifyReturnType(QualType RetTy) const {
+     }
+
+     // All other aggregates are returned indirectly.
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+   }
+
+   return (isPromotableTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+diff --git a/clang/lib/CodeGen/Targets/RISCV.cpp b/clang/lib/CodeGen/Targets/RISCV.cpp
+index 2c48ba37fd20..aa5fb6329c1c 100644
+--- a/clang/lib/CodeGen/Targets/RISCV.cpp
++++ b/clang/lib/CodeGen/Targets/RISCV.cpp
+@@ -410,8 +410,9 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) {
+     if (ArgGPRsLeft)
+       ArgGPRsLeft -= 1;
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/RAA ==
+-                                           CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(
++        Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/RAA == CGCXXABI::RAA_DirectInMemory);
+   }
+
+   uint64_t Size = getContext().getTypeSize(Ty);
+@@ -492,7 +493,9 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+     if (EIT->getNumBits() > 128 ||
+         (!getContext().getTargetInfo().hasInt128Type() &&
+          EIT->getNumBits() > 64))
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++      return getNaturalAlignIndirect(
++          Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++          /*ByVal=*/false);
+   }
+
+   return ABIArgInfo::getDirect();
+@@ -524,7 +527,9 @@ ABIArgInfo RISCVABIInfo::classifyArgumentType(QualType Ty, bool IsFixed,
+             llvm::IntegerType::get(getVMContext(), XLen), 2));
+     }
+   }
+-  return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++  return getNaturalAlignIndirect(
++      Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/false);
+ }
+
+ ABIArgInfo RISCVABIInfo::classifyReturnType(QualType RetTy) const {
+diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp
+index 5c75e985e953..b81ed29a5159 100644
+--- a/clang/lib/CodeGen/Targets/SPIR.cpp
++++ b/clang/lib/CodeGen/Targets/SPIR.cpp
+@@ -156,8 +156,10 @@ ABIArgInfo SPIRVABIInfo::classifyKernelArgumentType(QualType Ty) const {
+       // copied to be valid on the device.
+       // This behavior follows the CUDA spec
+       // https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#global-function-argument-processing,
+-      // and matches the NVPTX implementation.
+-      return getNaturalAlignIndirect(Ty, /* byval */ true);
++      // and matches the NVPTX implementation. TODO: hardcoding to 0 should be
++      // revisited if HIPSPV / byval starts making use of the AS of an indirect
++      // arg.
++      return getNaturalAlignIndirect(Ty, /*AddrSpace=*/0, /*byval=*/true);
+     }
+   }
+   return classifyArgumentType(Ty);
+@@ -172,7 +174,8 @@ ABIArgInfo SPIRVABIInfo::classifyArgumentType(QualType Ty) const {
+   // Records with non-trivial destructors/copy-constructors should not be
+   // passed by value.
+   if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+
+   if (const RecordType *RT = Ty->getAs<RecordType>()) {
+     const RecordDecl *RD = RT->getDecl();
+diff --git a/clang/lib/CodeGen/Targets/Sparc.cpp b/clang/lib/CodeGen/Targets/Sparc.cpp
+index da8c7219be26..9642196b78c6 100644
+--- a/clang/lib/CodeGen/Targets/Sparc.cpp
++++ b/clang/lib/CodeGen/Targets/Sparc.cpp
+@@ -232,7 +232,9 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const {
+   // Anything too big to fit in registers is passed with an explicit indirect
+   // pointer / sret pointer.
+   if (Size > SizeLimit)
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++    return getNaturalAlignIndirect(
++        Ty, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++        /*ByVal=*/false);
+
+   // Treat an enum type as its underlying type.
+   if (const EnumType *EnumTy = Ty->getAs<EnumType>())
+@@ -253,7 +255,8 @@ SparcV9ABIInfo::classifyType(QualType Ty, unsigned SizeLimit) const {
+   // If a C++ object has either a non-trivial copy constructor or a non-trivial
+   // destructor, it is passed with an explicit indirect pointer / sret pointer.
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+
+   // This is a small aggregate type that should be passed in registers.
+   // Build a coercion type from the LLVM struct type.
+diff --git a/clang/lib/CodeGen/Targets/SystemZ.cpp b/clang/lib/CodeGen/Targets/SystemZ.cpp
+index 23c96fa5cf98..c928d3c029ca 100644
+--- a/clang/lib/CodeGen/Targets/SystemZ.cpp
++++ b/clang/lib/CodeGen/Targets/SystemZ.cpp
+@@ -406,7 +406,7 @@ ABIArgInfo SystemZABIInfo::classifyReturnType(QualType RetTy) const {
+   if (isVectorArgumentType(RetTy))
+     return ABIArgInfo::getDirect();
+   if (isCompoundType(RetTy) || getContext().getTypeSize(RetTy) > 64)
+-    return getNaturalAlignIndirect(RetTy);
++    return getNaturalAlignIndirect(RetTy, getDataLayout().getAllocaAddrSpace());
+   return (isPromotableIntegerTypeForABI(RetTy) ? ABIArgInfo::getExtend(RetTy)
+                                                : ABIArgInfo::getDirect());
+ }
+@@ -417,7 +417,8 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
+
+   // Handle the generic C++ ABI.
+   if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI()))
+-    return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   RAA == CGCXXABI::RAA_DirectInMemory);
+
+   // Integers and enums are extended to full register width.
+   if (isPromotableIntegerTypeForABI(Ty))
+@@ -434,7 +435,8 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
+
+   // Values that are not 1, 2, 4 or 8 bytes in size are passed indirectly.
+   if (Size != 8 && Size != 16 && Size != 32 && Size != 64)
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   /*ByVal=*/false);
+
+   // Handle small structures.
+   if (const RecordType *RT = Ty->getAs<RecordType>()) {
+@@ -442,7 +444,8 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
+     // fail the size test above.
+     const RecordDecl *RD = RT->getDecl();
+     if (RD->hasFlexibleArrayMember())
+-      return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     /*ByVal=*/false);
+
+     // The structure is passed as an unextended integer, a float, or a double.
+     if (isFPArgumentType(SingleElementTy)) {
+@@ -459,7 +462,8 @@ ABIArgInfo SystemZABIInfo::classifyArgumentType(QualType Ty) const {
+
+   // Non-structure compounds are passed indirectly.
+   if (isCompoundType(Ty))
+-    return getNaturalAlignIndirect(Ty, /*ByVal=*/false);
++    return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                   /*ByVal=*/false);
+
+   return ABIArgInfo::getDirect(nullptr);
+ }
+diff --git a/clang/lib/CodeGen/Targets/WebAssembly.cpp b/clang/lib/CodeGen/Targets/WebAssembly.cpp
+index 70a968fe93ca..9217c78a540a 100644
+--- a/clang/lib/CodeGen/Targets/WebAssembly.cpp
++++ b/clang/lib/CodeGen/Targets/WebAssembly.cpp
+@@ -103,7 +103,8 @@ ABIArgInfo WebAssemblyABIInfo::classifyArgumentType(QualType Ty) const {
+     // Records with non-trivial destructors/copy-constructors should not be
+     // passed by value.
+     if (auto RAA = getRecordArgABI(Ty, getCXXABI()))
+-      return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory);
++      return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                     RAA == CGCXXABI::RAA_DirectInMemory);
+     // Ignore empty structs/unions.
+     if (isEmptyRecord(getContext(), Ty, true))
+       return ABIArgInfo::getIgnore();
+diff --git a/clang/lib/CodeGen/Targets/X86.cpp b/clang/lib/CodeGen/Targets/X86.cpp
+index 5ee5179dd0f3..b3664513c119 100644
+--- a/clang/lib/CodeGen/Targets/X86.cpp
++++ b/clang/lib/CodeGen/Targets/X86.cpp
+@@ -462,7 +462,9 @@ ABIArgInfo X86_32ABIInfo::getIndirectReturnResult(QualType RetTy, CCState &State
+     if (!IsMCUABI)
+       return getNaturalAlignIndirectInReg(RetTy);
+   }
+-  return getNaturalAlignIndirect(RetTy, /*ByVal=*/false);
++  return getNaturalAlignIndirect(
++      RetTy, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/false);
+ }
+
+ ABIArgInfo X86_32ABIInfo::classifyReturnType(QualType RetTy,
+@@ -599,20 +601,26 @@ ABIArgInfo X86_32ABIInfo::getIndirectResult(QualType Ty, bool ByVal,
+     if (!IsMCUABI)
+       return getNaturalAlignIndirectInReg(Ty);
+   }
+-  return getNaturalAlignIndirect(Ty, false);
++  return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(),
++                                 false);
+ }
+
+ // Compute the byval alignment.
+ unsigned TypeAlign = getContext().getTypeAlign(Ty) / 8;
+ unsigned StackAlign = getTypeStackAlignInBytes(Ty, TypeAlign);
+ if (StackAlign == 0)
+-  return ABIArgInfo::getIndirect(CharUnits::fromQuantity(4), /*ByVal=*/true);
++  return ABIArgInfo::getIndirect(
++      CharUnits::fromQuantity(4),
++      /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(),
++      /*ByVal=*/true);
+
+ // If the stack alignment is less than the type alignment, realign the
+ // argument.
+ bool Realign = TypeAlign > StackAlign; +- return ABIArgInfo::getIndirect(CharUnits::fromQuantity(StackAlign), +- /*ByVal=*/true, Realign); ++ return ABIArgInfo::getIndirect( ++ CharUnits::fromQuantity(StackAlign), ++ /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), /*ByVal=*/true, ++ Realign); + } + + X86_32ABIInfo::Class X86_32ABIInfo::classify(QualType Ty) const { +@@ -2164,13 +2172,13 @@ ABIArgInfo X86_64ABIInfo::getIndirectReturnResult(QualType Ty) const { + Ty = EnumTy->getDecl()->getIntegerType(); + + if (Ty->isBitIntType()) +- return getNaturalAlignIndirect(Ty); ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace()); + + return (isPromotableIntegerTypeForABI(Ty) ? ABIArgInfo::getExtend(Ty) + : ABIArgInfo::getDirect()); + } + +- return getNaturalAlignIndirect(Ty); ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace()); + } + + bool X86_64ABIInfo::IsIllegalVectorType(QualType Ty) const { +@@ -2210,7 +2218,8 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty, + } + + if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(Ty, getCXXABI())) +- return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), ++ RAA == CGCXXABI::RAA_DirectInMemory); + + // Compute the byval alignment. We specify the alignment of the byval in all + // cases so that the mid-level optimizer knows the alignment of the byval. +@@ -2247,7 +2256,8 @@ ABIArgInfo X86_64ABIInfo::getIndirectResult(QualType Ty, + Size)); + } + +- return ABIArgInfo::getIndirect(CharUnits::fromQuantity(Align)); ++ return ABIArgInfo::getIndirect(CharUnits::fromQuantity(Align), ++ getDataLayout().getAllocaAddrSpace()); + } + + /// The ABI specifies that a value should be passed in a full vector XMM/YMM +@@ -3283,12 +3293,13 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + if (RT) { + if (!IsReturnType) { + if (CGCXXABI::RecordArgABI RAA = getRecordArgABI(RT, getCXXABI())) +- return getNaturalAlignIndirect(Ty, RAA == CGCXXABI::RAA_DirectInMemory); ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), ++ RAA == CGCXXABI::RAA_DirectInMemory); + } + + if (RT->getDecl()->hasFlexibleArrayMember()) +- return getNaturalAlignIndirect(Ty, /*ByVal=*/false); +- ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + } + + const Type *Base = nullptr; +@@ -3304,7 +3315,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + return ABIArgInfo::getDirect(); + return ABIArgInfo::getExpand(); + } +- return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); ++ return ABIArgInfo::getIndirect( ++ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + } else if (IsVectorCall) { + if (FreeSSERegs >= NumElts && + (IsReturnType || Ty->isBuiltinType() || Ty->isVectorType())) { +@@ -3314,7 +3327,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + return ABIArgInfo::getExpand(); + } else if (!Ty->isBuiltinType() && !Ty->isVectorType()) { + // HVAs are delayed and reclassified in the 2nd step. 
+- return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); ++ return ABIArgInfo::getIndirect( ++ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + } + } + } +@@ -3331,7 +3346,8 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + // MS x64 ABI requirement: "Any argument that doesn't fit in 8 bytes, or is + // not 1, 2, 4, or 8 bytes, must be passed by reference." + if (Width > 64 || !llvm::isPowerOf2_64(Width)) +- return getNaturalAlignIndirect(Ty, /*ByVal=*/false); ++ return getNaturalAlignIndirect(Ty, getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + + // Otherwise, coerce it to a small integer. + return ABIArgInfo::getDirect(llvm::IntegerType::get(getVMContext(), Width)); +@@ -3350,7 +3366,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + if (IsMingw64) { + const llvm::fltSemantics *LDF = &getTarget().getLongDoubleFormat(); + if (LDF == &llvm::APFloat::x87DoubleExtended()) +- return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); ++ return ABIArgInfo::getIndirect( ++ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + } + break; + +@@ -3360,7 +3378,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + // than 8 bytes are passed indirectly. GCC follows it. We follow it too, + // even though it isn't particularly efficient. + if (!IsReturnType) +- return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); ++ return ABIArgInfo::getIndirect( ++ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + + // Mingw64 GCC returns i128 in XMM0. Coerce to v2i64 to handle that. + // Clang matches them for compatibility. +@@ -3380,7 +3400,9 @@ ABIArgInfo WinX86_64ABIInfo::classify(QualType Ty, unsigned &FreeSSERegs, + // the power of 2. 
+ if (Width <= 64) + return ABIArgInfo::getDirect(); +- return ABIArgInfo::getIndirect(Align, /*ByVal=*/false); ++ return ABIArgInfo::getIndirect( ++ Align, /*AddrSpace=*/getDataLayout().getAllocaAddrSpace(), ++ /*ByVal=*/false); + } + + return ABIArgInfo::getDirect(); +diff --git a/clang/test/CodeGen/partial-reinitialization2.c b/clang/test/CodeGen/partial-reinitialization2.c +index e709c1d4ad1e..7949a6955503 100644 +--- a/clang/test/CodeGen/partial-reinitialization2.c ++++ b/clang/test/CodeGen/partial-reinitialization2.c +@@ -91,8 +91,8 @@ void test5(void) + // CHECK-LABEL: test6 + void test6(void) + { +- // CHECK: [[LP:%[a-z0-9]+]] = getelementptr{{.*}}%struct.LLP2P2, ptr{{.*}}, i32 0, i32 0 +- // CHECK: call {{.*}}get456789(ptr {{.*}}[[LP]]) ++ // CHECK: [[VAR:%[a-z0-9]+]] = alloca ++ // CHECK: call {{.*}}get456789(ptr {{.*}}sret{{.*}} [[VAR]]) + + // CHECK: [[CALL:%[a-z0-9]+]] = call {{.*}}@get235() + // CHECK: store{{.*}}[[CALL]], {{.*}}[[TMP0:%[a-z0-9.]+]] +diff --git a/clang/test/CodeGen/sret.c b/clang/test/CodeGen/sret.c +index 83dce80aa279..883049041b70 100644 +--- a/clang/test/CodeGen/sret.c ++++ b/clang/test/CodeGen/sret.c +@@ -1,4 +1,5 @@ + // RUN: %clang_cc1 %s -Wno-strict-prototypes -emit-llvm -o - | FileCheck %s ++// RUN: %clang_cc1 %s -Wno-strict-prototypes -triple amdgcn-amd-amdhsa -emit-llvm -o - | FileCheck --check-prefix=NONZEROALLOCAAS %s + + struct abc { + long a; +@@ -6,18 +7,28 @@ struct abc { + long c; + long d; + long e; ++ long f; ++ long g; ++ long h; ++ long i; ++ long j; + }; + + struct abc foo1(void); + // CHECK-DAG: declare {{.*}} @foo1(ptr dead_on_unwind writable sret(%struct.abc) ++// NONZEROALLOCAAS-DAG: declare {{.*}} @foo1(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc) + struct abc foo2(); + // CHECK-DAG: declare {{.*}} @foo2(ptr dead_on_unwind writable sret(%struct.abc) ++// NONZEROALLOCAAS-DAG: declare {{.*}} @foo2(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc) + struct abc foo3(void) { return (struct abc){0}; } + // CHECK-DAG: define {{.*}} @foo3(ptr dead_on_unwind noalias writable sret(%struct.abc) ++// NONZEROALLOCAAS-DAG: define {{.*}} @foo3(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.abc) + + void bar(void) { + struct abc dummy1 = foo1(); + // CHECK-DAG: call {{.*}} @foo1(ptr dead_on_unwind writable sret(%struct.abc) ++ // NONZEROALLOCAAS-DAG: call {{.*}} @foo1(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc) + struct abc dummy2 = foo2(); + // CHECK-DAG: call {{.*}} @foo2(ptr dead_on_unwind writable sret(%struct.abc) ++ // NONZEROALLOCAAS-DAG: call {{.*}} @foo2(ptr addrspace(5) dead_on_unwind writable sret(%struct.abc) + } +diff --git a/clang/test/CodeGenCXX/no-elide-constructors.cpp b/clang/test/CodeGenCXX/no-elide-constructors.cpp +index 750392a43e05..994282debb0d 100644 +--- a/clang/test/CodeGenCXX/no-elide-constructors.cpp ++++ b/clang/test/CodeGenCXX/no-elide-constructors.cpp +@@ -1,7 +1,9 @@ + // RUN: %clang_cc1 -std=c++98 -triple i386-unknown-unknown -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX98 + // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11 ++// RUN: %clang_cc1 -std=c++11 -triple amdgcn-amd-amdhsa -fno-elide-constructors -emit-llvm -o - %s | FileCheck %s --check-prefixes=CHECK --check-prefix=CHECK-CXX11-NONZEROALLOCAAS + // RUN: %clang_cc1 -std=c++98 -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck 
%s --check-prefix=CHECK --check-prefix=CHECK-CXX98-ELIDE + // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11-ELIDE ++// RUN: %clang_cc1 -std=c++11 -triple amdgcn-amd-amdhsa -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-CXX11-NONZEROALLOCAAS-ELIDE + + // Reduced from PR12208 + class X { +@@ -15,6 +17,7 @@ public: + }; + + // CHECK-LABEL: define{{.*}} void @_Z4Testv( ++// CHECK-SAME: ptr {{.*}}dead_on_unwind noalias writable sret([[CLASS_X:%.*]]) align 1 [[AGG_RESULT:%.*]]) + X Test() + { + X x; +@@ -23,8 +26,11 @@ X Test() + // sret argument. + // CHECK-CXX98: call void @_ZN1XC1ERKS_( + // CHECK-CXX11: call void @_ZN1XC1EOS_( ++ // CHECK-CXX11-NONZEROALLOCAAS: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[AGG_RESULT]] to ptr ++ // CHECK-CXX11-NONZEROALLOCAAS-NEXT: call void @_ZN1XC1EOS_(ptr noundef nonnull align 1 dereferenceable(1) [[TMP0]] + // CHECK-CXX98-ELIDE-NOT: call void @_ZN1XC1ERKS_( + // CHECK-CXX11-ELIDE-NOT: call void @_ZN1XC1EOS_( ++ // CHECK-CXX11-NONZEROALLOCAAS-ELIDE-NOT: call void @_ZN1XC1EOS_( + + // Make sure that the destructor for X is called. + // FIXME: This call is present even in the -ELIDE runs, but is guarded by a +diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +index 57d056b0ff9d..effdeb954680 100644 +--- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl ++++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +@@ -154,7 +154,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + // AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) + // AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr + // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +-// AMDGCN20-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr + // AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 + // AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +@@ -164,10 +163,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + // AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 + // AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 + // AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +-// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0 ++// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 + // AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +-// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4 +-// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false) ++// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 ++// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) + // AMDGCN20-NEXT: ret void + // + // SPIR-LABEL: define dso_local spir_kernel void @ker( +@@ -250,7 +249,7 @@ 
kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + // AMDGCN-NEXT: ret void + // + // AMDGCN20-LABEL: define dso_local void @foo_large( +-// AMDGCN20-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { ++// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { + // AMDGCN20-NEXT: [[ENTRY:.*:]] + // AMDGCN20-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) + // AMDGCN20-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +@@ -327,7 +326,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + // AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) + // AMDGCN20-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr + // AMDGCN20-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +-// AMDGCN20-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr + // AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 + // AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +@@ -335,8 +333,8 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + // AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 + // AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +-// AMDGCN20-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +-// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false) ++// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] ++// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) + // AMDGCN20-NEXT: ret void + // + // SPIR-LABEL: define dso_local spir_kernel void @ker_large( +diff --git a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +index 084281a8cada..2f8ba99a3e41 100644 +--- a/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl ++++ b/clang/test/CodeGenOpenCL/amdgpu-abi-struct-arg-byref.cl +@@ -70,7 +70,6 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + // AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) + // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr + // AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +-// AMDGCN-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr + // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN-NEXT: store 
ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 + // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +@@ -80,10 +79,10 @@ Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { + // AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 + // AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 + // AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +-// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr [[TMP_ASCAST]], i32 0, i32 0 ++// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 + // AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +-// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr [[TMP4]], align 4 +-// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 64, i1 false) ++// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 ++// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) + // AMDGCN-NEXT: ret void + // + kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { +@@ -91,7 +90,7 @@ kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { + } + + // AMDGCN-LABEL: define dso_local void @foo_large( +-// AMDGCN-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { ++// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { + // AMDGCN-NEXT: [[ENTRY:.*:]] + // AMDGCN-NEXT: [[COERCE:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) + // AMDGCN-NEXT: [[IN:%.*]] = addrspacecast ptr addrspace(5) [[COERCE]] to ptr +@@ -112,7 +111,6 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + // AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) + // AMDGCN-NEXT: [[IN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[IN_ADDR]] to ptr + // AMDGCN-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +-// AMDGCN-NEXT: [[TMP_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMP]] to ptr + // AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8 + // AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +@@ -120,8 +118,8 @@ Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { + // AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR_ASCAST]], align 8 + // AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 + // AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +-// AMDGCN-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP_ASCAST]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +-// AMDGCN-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr 
addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP_ASCAST]], i64 16384, i1 false) ++// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] ++// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) + // AMDGCN-NEXT: ret void + // + kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { +diff --git a/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +new file mode 100644 +index 000000000000..4a7bb8227c33 +--- /dev/null ++++ b/clang/test/CodeGenOpenCL/implicit-addrspacecast-function-parameter.cl +@@ -0,0 +1,68 @@ ++// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 ++// RUN: %clang_cc1 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -disable-llvm-passes -emit-llvm -o - %s | FileCheck %s ++ ++// Check there's no assertion when passing a pointer to an address space ++// qualified argument. ++ ++extern void private_ptr(__private int *); ++extern void local_ptr(__local int *); ++extern void generic_ptr(__generic int *); ++ ++// CHECK-LABEL: define dso_local void @use_of_private_var( ++// CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ++// CHECK-NEXT: [[ENTRY:.*:]] ++// CHECK-NEXT: [[X:%.*]] = alloca i32, align 4, addrspace(5) ++// CHECK-NEXT: [[X_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X]] to ptr ++// CHECK-NEXT: call void @llvm.lifetime.start.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4:[0-9]+]] ++// CHECK-NEXT: store i32 0, ptr [[X_ASCAST]], align 4, !tbaa [[TBAA4:![0-9]+]] ++// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ASCAST]] to ptr addrspace(5) ++// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5:[0-9]+]] ++// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ASCAST]]) #[[ATTR5]] ++// CHECK-NEXT: call void @llvm.lifetime.end.p5(i64 4, ptr addrspace(5) [[X]]) #[[ATTR4]] ++// CHECK-NEXT: ret void ++// ++void use_of_private_var() ++{ ++ int x = 0 ; ++ private_ptr(&x); ++ generic_ptr(&x); ++} ++ ++// CHECK-LABEL: define dso_local void @addr_of_arg( ++// CHECK-SAME: i32 noundef [[X:%.*]]) #[[ATTR0]] { ++// CHECK-NEXT: [[ENTRY:.*:]] ++// CHECK-NEXT: [[X_ADDR:%.*]] = alloca i32, align 4, addrspace(5) ++// CHECK-NEXT: [[X_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[X_ADDR]] to ptr ++// CHECK-NEXT: store i32 [[X]], ptr [[X_ADDR_ASCAST]], align 4, !tbaa [[TBAA4]] ++// CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr [[X_ADDR_ASCAST]] to ptr addrspace(5) ++// CHECK-NEXT: call void @private_ptr(ptr addrspace(5) noundef [[TMP0]]) #[[ATTR5]] ++// CHECK-NEXT: call void @generic_ptr(ptr noundef [[X_ADDR_ASCAST]]) #[[ATTR5]] ++// CHECK-NEXT: ret void ++// ++void addr_of_arg(int x) ++{ ++ private_ptr(&x); ++ generic_ptr(&x); ++} ++ ++// CHECK-LABEL: define dso_local amdgpu_kernel void @use_of_local_var( ++// CHECK-SAME: ) #[[ATTR3:[0-9]+]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] { ++// CHECK-NEXT: [[ENTRY:.*:]] ++// CHECK-NEXT: call void @local_ptr(ptr addrspace(3) noundef @use_of_local_var.x) #[[ATTR5]] ++// CHECK-NEXT: call void @generic_ptr(ptr noundef addrspacecast (ptr addrspace(3) @use_of_local_var.x to ptr)) #[[ATTR5]] ++// CHECK-NEXT: ret void ++// ++__kernel 
void use_of_local_var() ++{ ++ __local int x; ++ local_ptr(&x); ++ generic_ptr(&x); ++} ++ ++//. ++// CHECK: [[TBAA4]] = !{[[META5:![0-9]+]], [[META5]], i64 0} ++// CHECK: [[META5]] = !{!"int", [[META6:![0-9]+]], i64 0} ++// CHECK: [[META6]] = !{!"omnipotent char", [[META7:![0-9]+]], i64 0} ++// CHECK: [[META7]] = !{!"Simple C/C++ TBAA"} ++// CHECK: [[META8]] = !{} ++//. +-- +2.34.1 + diff --git a/third_party/llvm/workspace.bzl b/third_party/llvm/workspace.bzl new file mode 100644 index 00000000000000..e07ffee1d7a9e8 --- /dev/null +++ b/third_party/llvm/workspace.bzl @@ -0,0 +1,28 @@ +"""Provides the repository macro to import LLVM.""" + +load("//third_party:repo.bzl", "tf_http_archive") + +def repo(name): + """Imports LLVM.""" + LLVM_COMMIT = "f8287f6c373fcf993643dd6f0e30dde304c1be73" + LLVM_SHA256 = "add2841174abc79c45aa309bdf0cf631aa8f97e7a4df57dcfca57c60df27527f" + + tf_http_archive( + name = name, + sha256 = LLVM_SHA256, + strip_prefix = "llvm-project-{commit}".format(commit = LLVM_COMMIT), + urls = [ + "https://storage.googleapis.com/mirror.tensorflow.org/github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), + "https://github.com/llvm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT), + ], + build_file = "//third_party/llvm:llvm.BUILD", + patch_file = [ + "//third_party/llvm:generated.patch", # Autogenerated, don't remove. + "//third_party/llvm:build.patch", + "//third_party/llvm:mathextras.patch", + "//third_party/llvm:toolchains.patch", + "//third_party/llvm:zstd.patch", + "//third_party/llvm:0001-clang-CodeGen-sret-args-should-always-point-to-the-a.patch", + ], + link_files = {"//third_party/llvm:run_lit.sh": "mlir/run_lit.sh"}, + ) diff --git a/third_party/xla/third_party/rocm_device_libs/BUILD b/third_party/xla/third_party/rocm_device_libs/BUILD new file mode 100644 index 00000000000000..cc558415f2b3d2 --- /dev/null +++ b/third_party/xla/third_party/rocm_device_libs/BUILD @@ -0,0 +1 @@ +# copybara:uncomment package(default_applicable_licenses = ["//third_party/tensorflow:license"]) diff --git a/third_party/xla/third_party/rocm_device_libs/build_defs.bzl b/third_party/xla/third_party/rocm_device_libs/build_defs.bzl new file mode 100644 index 00000000000000..b69cfa2ab88290 --- /dev/null +++ b/third_party/xla/third_party/rocm_device_libs/build_defs.bzl @@ -0,0 +1,96 @@ +load("@bazel_skylib//lib:paths.bzl", "paths") + +def bitcode_library( + name, + srcs = [], + hdrs = [], + file_specific_flags = {}, + **kwargs +): + """Builds a bitcode library + + Args: + name: Unique name of the build rule. + srcs: List of source files (*.cl, *.ll). + hdrs: List of header files (*.h). + file_specific_flags: Per-file dict of flags to be passed to clang. + **kwargs: Attributes relevant for a common rule. 
+  """
+  clang_tool = "@llvm-project//clang:clang"
+  clang_include = "@llvm-raw//:clang/lib/Headers"
+  llvm_link_tool = "@llvm-project//llvm:llvm-link"
+  opt_tool = "@llvm-project//llvm:opt"
+  prepare_builtins_tool = ":prepare_builtins"
+
+  include_paths = dict([(paths.dirname(h), None) for h in hdrs]).keys()
+  includes = " ".join(["-I$(location {})".format(inc) for inc in include_paths])
+  flags = ("-fcolor-diagnostics -Werror -Wno-error=atomic-alignment -x cl -Xclang " +
+           "-cl-std=CL2.0 --target=amdgcn-amd-amdhsa -fvisibility=hidden -fomit-frame-pointer " +
+           "-Xclang -finclude-default-header -Xclang -fexperimental-strict-floating-point " +
+           "-Xclang -fdenormal-fp-math=dynamic -Xclang -Qn " +
+           "-nogpulib -cl-no-stdinc -Xclang -mcode-object-version=none")
+
+  link_inputs = []
+
+  for src in srcs:
+    filename = paths.basename(src)
+    (basename, _, ext) = filename.partition(".")
+
+    if (ext == "ll"):
+      link_inputs.append(src)
+      continue
+
+    out = basename + ".bc"
+    link_inputs.append(out)
+    extra_flags = " ".join(file_specific_flags.get(filename, []))
+    native.genrule(
+      name = "compile_" + basename,
+      srcs = [src] + hdrs + include_paths,
+      outs = [out],
+      #TODO(rocm): Ugly hack to access builtin clang includes.
+      cmd = "$(location {}) -I$(execpath {}).runfiles/llvm-project/clang/staging/include/ {} {} {} -emit-llvm -c $(location {}) -o $@".format(
+        clang_tool, clang_tool, includes, flags, extra_flags, src),
+      tools = [clang_tool],
+      message = "Compiling {} ...".format(filename),
+    )
+
+  link_message = "Linking {}.bc ...".format(name)
+
+  prelink_out = name + ".link0.lib.bc"
+  native.genrule(
+    name = "prelink_" + name,
+    srcs = link_inputs,
+    outs = [prelink_out],
+    cmd = "$(location {}) $(SRCS) -o $@".format(llvm_link_tool),
+    tools = [llvm_link_tool],
+    message = link_message,
+  )
+
+  internalize_out = name + ".lib.bc"
+  native.genrule(
+    name = "internalize_" + name,
+    srcs = [prelink_out],
+    outs = [internalize_out],
+    cmd = "$(location {}) -internalize -only-needed $< -o $@".format(llvm_link_tool),
+    tools = [llvm_link_tool],
+    message = link_message,
+  )
+
+  strip_out = name + ".strip.bc"
+  native.genrule(
+    name = "strip_" + name,
+    srcs = [internalize_out],
+    outs = [strip_out],
+    cmd = "$(location {}) -passes=amdgpu-unify-metadata,strip -o $@ $<".format(opt_tool),
+    tools = [opt_tool],
+    message = link_message,
+  )
+
+  native.genrule(
+    name = name,
+    srcs = [strip_out],
+    outs = [name + ".bc"],
+    cmd = "$(location {}) -o $@ $<".format(prepare_builtins_tool),
+    tools = [prepare_builtins_tool],
+    message = link_message,
+  )
\ No newline at end of file
diff --git a/third_party/xla/third_party/rocm_device_libs/prepare_builtins.patch b/third_party/xla/third_party/rocm_device_libs/prepare_builtins.patch
new file mode 100644
index 00000000000000..c24b38a910ba72
--- /dev/null
+++ b/third_party/xla/third_party/rocm_device_libs/prepare_builtins.patch
@@ -0,0 +1,18 @@
+diff --git a/utils/prepare-builtins/prepare-builtins.cpp b/utils/prepare-builtins/prepare-builtins.cpp
+index 7fc9d06dab7d..2a93638c3f8f 100644
+--- a/utils/prepare-builtins/prepare-builtins.cpp
++++ b/utils/prepare-builtins/prepare-builtins.cpp
+@@ -73,6 +73,13 @@ int main(int argc, char **argv) {
+     return 1;
+   }
+ 
++  // Strip the OpenCL version metadata. There are a lot of linked
++  // modules in the library build, each spamming the same
++  // version. This may also report a different version than the user
++  // program is using. This should probably be uniqued when linking.
++ if (NamedMDNode *OCLVersion = M->getNamedMetadata("opencl.ocl.version")) ++ M->eraseNamedMetadata(OCLVersion); ++ + // Set linkage of every external definition to linkonce_odr. + for (Module::iterator i = M->begin(), e = M->end(); i != e; ++i) { + if (!i->isDeclaration() && i->getLinkage() == GlobalValue::ExternalLinkage) { diff --git a/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD new file mode 100644 index 00000000000000..eaf2e5917d2a15 --- /dev/null +++ b/third_party/xla/third_party/rocm_device_libs/rocm_device_libs.BUILD @@ -0,0 +1,62 @@ +load("build_defs.bzl", "bitcode_library") + +licenses(["notice"]) + +package(default_visibility = ["//visibility:public"]) + +exports_files([ + "LICENSE.TXT", +]) + +cc_binary( + name = "prepare_builtins", + srcs = glob([ + "utils/prepare-builtins/*.cpp", + "utils/prepare-builtins/*.h", + ]), + copts = [ + "-fno-rtti -fno-exceptions", + ], + deps = [ + "@llvm-project//llvm:BitReader", + "@llvm-project//llvm:BitWriter", + "@llvm-project//llvm:Core", + "@llvm-project//llvm:IRReader", + "@llvm-project//llvm:Support", + ], + visibility = ["//visibility:private"], +) + +bitcode_library( + name = "ocml", + srcs = glob([ + "ocml/src/*.cl" + ]), + hdrs = glob([ + "ocml/src/*.h", + "ocml/inc/*.h", + "irif/inc/*.h", + "oclc/inc/*.h", + ]), + file_specific_flags = { + "native_logF.cl": ["-fapprox-func"], + "native_expF.cl": ["-fapprox-func"], + "sqrtF.cl": ["-cl-fp32-correctly-rounded-divide-sqrt"], + }, +) + +bitcode_library( + name = "ockl", + srcs = glob([ + "ockl/src/*.cl", + "ockl/src/*.ll", + ]), + hdrs = glob([ + "ockl/inc/*.h", + "irif/inc/*.h", + "oclc/inc/*.h", + ]), + file_specific_flags = { + "gaaf.cl": ["-munsafe-fp-atomics"], + }, +) diff --git a/third_party/xla/third_party/rocm_device_libs/workspace.bzl b/third_party/xla/third_party/rocm_device_libs/workspace.bzl new file mode 100644 index 00000000000000..bf810030874ced --- /dev/null +++ b/third_party/xla/third_party/rocm_device_libs/workspace.bzl @@ -0,0 +1,22 @@ +"""Provides the repository macro to import Rocm-Device-Libs""" + +load("//third_party:repo.bzl", "tf_http_archive", "tf_mirror_urls") + +def repo(): + """Imports Rocm-Device-Libs.""" + LLVM_COMMIT = "0cf1859d038376421b4cd597e3df90d37cfca06e" + LLVM_SHA256 = "0374d1efa0f049d2d1c24c4d86029b006cb5594cc0a1b6a18c49fb094c29cd29" + + tf_http_archive( + name = "rocm_device_libs", + sha256 = LLVM_SHA256, + strip_prefix = "llvm-project-{commit}/amd/device-libs".format(commit = LLVM_COMMIT), + urls = tf_mirror_urls("https://github.com/ROCm/llvm-project/archive/{commit}.tar.gz".format(commit = LLVM_COMMIT)), + build_file = "//third_party/rocm_device_libs:rocm_device_libs.BUILD", + patch_file = [ + "//third_party/rocm_device_libs:prepare_builtins.patch", + ], + link_files = { + "//third_party/rocm_device_libs:build_defs.bzl": "build_defs.bzl", + }, + ) diff --git a/third_party/xla/workspace2.bzl b/third_party/xla/workspace2.bzl index 9e4166034d062b..b0c5d814afd3d3 100644 --- a/third_party/xla/workspace2.bzl +++ b/third_party/xla/workspace2.bzl @@ -36,6 +36,7 @@ load("//third_party/py:python_configure.bzl", "python_configure") load("//third_party/py/ml_dtypes:workspace.bzl", ml_dtypes = "repo") load("//third_party/pybind11_abseil:workspace.bzl", pybind11_abseil = "repo") load("//third_party/pybind11_bazel:workspace.bzl", pybind11_bazel = "repo") +load("//third_party/rocm_device_libs:workspace.bzl", rocm_device_libs = "repo") 
load("//third_party/robin_map:workspace.bzl", robin_map = "repo") load("//third_party/shardy:workspace.bzl", shardy = "repo") load("//third_party/stablehlo:workspace.bzl", stablehlo = "repo") @@ -72,6 +73,7 @@ def _initialize_third_party(): nvshmem() pybind11_abseil() pybind11_bazel() + rocm_device_libs() robin_map() shardy() stablehlo() diff --git a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc index 43dd221e3ca370..93b5bc23f5fa38 100644 --- a/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc +++ b/third_party/xla/xla/backends/gpu/codegen/triton/compilation_pipeline_rocm.cc @@ -160,12 +160,8 @@ absl::Status CreateTritonPipeline( } std::string GetLibdevicePath(const HloModuleConfig& hlo_config, - const se::DeviceDescription& device_info) { - std::string libdevice_dir = tsl::RocdlRoot(); - auto compute_capability = device_info.rocm_compute_capability(); - const std::string libdevice_path = - amdgpu::LibDevicePath(compute_capability.gcn_arch_name(), libdevice_dir); - return libdevice_path; + const se::DeviceDescription& device_info) { + return "__builtin__"; } } // namespace gpu diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD index 31a599dc75f0cd..fc0dc0119b51de 100644 --- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD +++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/BUILD @@ -5,6 +5,7 @@ load( "if_oss", "internal_visibility", ) +load("//xla:strict.default.bzl", "py_strict_binary") package( # copybara:uncomment default_applicable_licenses = ["//tensorflow:license"], @@ -128,6 +129,31 @@ cc_library( ], ) +py_strict_binary( + name = "generate_amdgpu_device_lib_data_tool", + srcs = ["generate_amdgpu_device_lib_data_tool.py"], +) + +genrule( + name = "generate_amdgpu_device_lib_data", + srcs = [ + "@rocm_device_libs//:ockl", + "@rocm_device_libs//:ocml", + ], + outs = ["amdgpu_device_lib_data.inc"], + cmd = "$(location {}) --llvm_link_bin $(location {}) $(SRCS) -o $@".format( + ":generate_amdgpu_device_lib_data_tool", "@llvm-project//llvm:llvm-link"), + tools = [":generate_amdgpu_device_lib_data_tool", "@llvm-project//llvm:llvm-link"], +) + +cc_library( + name = "amdgpu_device_lib_data", + hdrs = [ + ":generate_amdgpu_device_lib_data", + ], + include_prefix = ".", +) + cc_library( name = "amdgpu_backend", srcs = [ @@ -138,6 +164,7 @@ cc_library( ], local_defines = if_oss(["HAS_SUPPORT_FOR_LLD_AS_A_LIBRARY=1"]), deps = [ + ":amdgpu_device_lib_data", ":llvm_gpu_backend", ":load_ir_module", "//xla:util", diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc index acb00ff57bfafd..b05c253d1e8ceb 100644 --- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc +++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.cc @@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 + http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, @@ -100,34 +100,17 @@ namespace { // Inline threshold value to use in LLVM AMDGPU backend. 
 const int kAMDGPUInlineThreshold = 0x100000;
+const int32_t kAMDGPUAbiVersion = 500;
 
 // Gets the ROCm-Device-Libs filenames for a particular AMDGPU version.
-std::vector<std::string> GetROCDLPaths(std::string gcn_arch_name,
-                                       const std::string& rocdl_dir_path) {
-  // AMDGPU version-neutral bitcodes.
-  static std::vector<std::string>* rocdl_filenames =
-      new std::vector<std::string>(
-          {"opencl.bc", "ocml.bc", "ockl.bc", "oclc_finite_only_off.bc",
-           "oclc_daz_opt_off.bc", "oclc_correctly_rounded_sqrt_on.bc",
-           "oclc_unsafe_math_off.bc", "oclc_wavefrontsize64_on.bc",
-           "oclc_abi_version_500.bc"});
-
+std::vector<std::string> GetROCDLPaths(const std::string& rocdl_dir_path) {
   // Construct full path to ROCDL bitcode libraries.
   std::vector<std::string> result;
-  result.reserve(rocdl_filenames->size() + 1);
-  for (auto& filename : *rocdl_filenames) {
-    result.push_back(tsl::io::JoinPath(rocdl_dir_path, filename));
+  result.reserve(2);
+  for (absl::string_view filename : {"ocml.bc", "ockl.bc"}) {
+    result.emplace_back(tsl::io::JoinPath(rocdl_dir_path, filename));
   }
-  // Add AMDGPU version-specific bitcodes.
-  std::vector<std::string> tokens = absl::StrSplit(gcn_arch_name, ':');
-  std::string amdgpu_version = gcn_arch_name;
-  if (!tokens.empty() && tokens[0].size() >= 3) {
-    amdgpu_version = tokens[0].substr(3);
-  }
-  result.push_back(tsl::io::JoinPath(
-      rocdl_dir_path,
-      absl::StrCat("oclc_isa_version_", amdgpu_version, ".bc")));
   return result;
 }
@@ -332,14 +315,103 @@ absl::StatusOr<std::vector<uint8_t>> EmitModuleToHsaco(
 // Links ROCm-Device-Libs into the given module if the module needs it.
 absl::Status LinkROCDLIfNecessary(llvm::Module* module,
-                                  std::string gcn_arch_name,
+                                  const std::string& gfx_version,
+                                  const DebugOptions& debug_options,
                                   const std::string& rocdl_dir_path) {
   if (!CouldNeedDeviceBitcode(*module)) {
     return absl::OkStatus();
   }
-  return LinkWithBitcodeVector(module,
-                               GetROCDLPaths(gcn_arch_name, rocdl_dir_path));
+  auto addControlVariable = [&](llvm::StringRef name, uint32_t value,
+                                uint32_t bitwidth = 8) {
+    if (module->getNamedGlobal(name)) return;
+    llvm::IntegerType* type =
+        llvm::IntegerType::getIntNTy(module->getContext(), bitwidth);
+    llvm::GlobalVariable* control_variable = new llvm::GlobalVariable(
+        *module, type, /*isConstant=*/true,
+        llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage,
+        llvm::ConstantInt::get(type, value), name, /*before=*/nullptr,
+        /*threadLocalMode=*/llvm::GlobalValue::ThreadLocalMode::NotThreadLocal,
+        /*addressSpace=*/4);
+    control_variable->setVisibility(
+        llvm::GlobalValue::VisibilityTypes::ProtectedVisibility);
+    control_variable->setAlignment(llvm::MaybeAlign(bitwidth / 8));
+    control_variable->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Local);
+    VLOG(2) << "addControlVariable " << name.data() << " " << value;
+  };
+
+  addControlVariable("__oclc_finite_only_opt", false);
+  // TODO(rocm): Maybe check ftz for this one
+  addControlVariable("__oclc_daz_opt", false);
+  addControlVariable("__oclc_correctly_rounded_sqrt32", true);
+  addControlVariable("__oclc_unsafe_math_opt", false);
+
+  // TODO(rocm): Move this into device_description.h or use llvm infra
+  CHECK((gfx_version[3] == '9' && gfx_version.size() == 6) ||
+        (gfx_version[3] == '1' && gfx_version.size() == 7));
+
+  uint32_t major, stepping, minor;
+
+  if (gfx_version[3] == '9') {
+    major = 9;
+    CHECK(absl::SimpleAtoi({&gfx_version[4], 1}, &stepping));
+    CHECK(absl::SimpleHexAtoi({&gfx_version[5], 1}, &minor));
+  } else {
+    CHECK(absl::SimpleAtoi({&gfx_version[3], 2}, &major));
+    CHECK(absl::SimpleAtoi({&gfx_version[5], 1}, &stepping));
+    CHECK(absl::SimpleAtoi({&gfx_version[6], 1}, &minor));
+  }
+
+  // TODO(rocm): Not great, not terrible
+  addControlVariable("__oclc_wavefrontsize64", major == 9);
+  addControlVariable("__oclc_ISA_version",
+                     1000 * major + 100 * stepping + minor, 32);
+  addControlVariable("__oclc_ABI_version", kAMDGPUAbiVersion, 32);
+
+
+  static bool use_embedded_device_lib = []() {
+    bool embedded_device_lib = false;
+    TF_CHECK_OK(tsl::ReadBoolFromEnvVar("TF_ROCM_EMBEDDED_DEVICE_LIB",
+                                        /*default_val=*/true, &embedded_device_lib));
+    return embedded_device_lib;
+  }();
+
+  if (use_embedded_device_lib) {
+    static const char device_lib_data[] = {
+#include "amdgpu_device_lib_data.inc"
+    };
+
+    llvm::Linker linker(*module);
+    auto device_lib = llvm::getLazyBitcodeModule(
+        {llvm::StringRef{device_lib_data, sizeof(device_lib_data)},
+         "device_lib"},
+        module->getContext());
+    if (!device_lib) {
+      return xla::Internal("Error loading embedded device lib.");
+    }
+    if (linker.linkInModule(
+            std::move(*device_lib), llvm::Linker::Flags::LinkOnlyNeeded,
+            [](llvm::Module& M, const llvm::StringSet<>& GVS) {
+              internalizeModule(M, [&GVS](const llvm::GlobalValue& GV) {
+                return !GV.hasName() || (GVS.count(GV.getName()) == 0);
+              });
+            })) {
+      return xla::Internal("Error linking embedded device lib.");
+    }
+    return absl::OkStatus();
+  }
+
+  TF_RETURN_IF_ERROR(
+      LinkWithBitcodeVector(module, GetROCDLPaths(rocdl_dir_path)));
+
+  // Sanitize stray metadata from the bitcode files
+  if (auto* opencl_version = module->getNamedMetadata("opencl.ocl.version"))
+    module->eraseNamedMetadata(opencl_version);
+
+  if (auto* ident = module->getNamedMetadata("llvm.ident"))
+    module->eraseNamedMetadata(ident);
+
+  return absl::OkStatus();
 }
 
 absl::Status AMDGPUTargetModuleLinker(
@@ -354,9 +426,9 @@ absl::Status AMDGPUTargetModuleLinker(
     return xla::Internal("Incompatible compute capability was specified.");
   }
 
-  std::string gcn_arch_name = compute_capability->gcn_arch_name();
   TF_RETURN_IF_ERROR(
-      LinkROCDLIfNecessary(module, gcn_arch_name, device_bitcode_dir_path));
+      LinkROCDLIfNecessary(module, compute_capability->gfx_version(),
+                           debug_options, device_bitcode_dir_path));
 
   // If ftz is enabled, set it as an attribute on every function in the module.
   if (debug_options.xla_gpu_ftz()) {
@@ -364,9 +436,9 @@ absl::Status AMDGPUTargetModuleLinker(
       fn.addFnAttr("denormal-fp-math-f32", "preserve-sign");
     }
   }
-  const int32_t kAbiVersion = 500;
+
   module->addModuleFlag(llvm::Module::Error, "amdhsa_code_object_version",
-                        kAbiVersion);
+                        kAMDGPUAbiVersion);
 
   return absl::OkStatus();
 }
@@ -493,17 +565,6 @@ std::vector<std::string> GetAMDGPUBackendOptions(
   return backend_llvm_opts;
 }
 
-std::string LibDevicePath(std::string gcn_arch_name,
-                          const std::string& rocdl_dir_path) {
-  auto libdevice_dir_paths = GetROCDLPaths(gcn_arch_name, rocdl_dir_path);
-  for (auto libdevice_dir_path : libdevice_dir_paths) {
-    if (libdevice_dir_path.find("ocml.bc")) {
-      return libdevice_dir_path;
-    }
-  }
-  return "";
-}
-
 absl::StatusOr<std::vector<uint8_t>> CompileToHsaco(
     llvm::Module* module, se::GpuComputeCapability gpu_version,
     const DebugOptions& debug_options,
diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h
index 878edd52a0b981..6baa0be9d9e869 100644
--- a/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h
+++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/amdgpu_backend.h
@@ -27,9 +27,6 @@ limitations under the License.
#include "xla/xla.pb.h" namespace xla::gpu::amdgpu { -// Get path to libdevice file. -std::string LibDevicePath(std::string gcn_arch_name, - const std::string& rocdl_dir_path); // Compiles the argument module and returns it with LLVM AMDGPU backend. // rocdl_dir_path is the parent directory of ROCm-Device-Libs bitcode libraries. // The contents of the module may be changed. diff --git a/third_party/xla/xla/service/gpu/llvm_gpu_backend/generate_amdgpu_device_lib_data_tool.py b/third_party/xla/xla/service/gpu/llvm_gpu_backend/generate_amdgpu_device_lib_data_tool.py new file mode 100644 index 00000000000000..f06299c00ab113 --- /dev/null +++ b/third_party/xla/xla/service/gpu/llvm_gpu_backend/generate_amdgpu_device_lib_data_tool.py @@ -0,0 +1,42 @@ +import argparse +import subprocess +import itertools + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument( + "--llvm_link_bin", required=True, help="Path to the llvm-link binary" + ) + parser.add_argument( + "-o", "--output", required=True, help="Output filename for the C header" + ) + parser.add_argument( + "input_files", nargs="+", help="Variable number of input filenames" + ) + + args = parser.parse_args() + llvm_link_bin = args.llvm_link_bin + output_filename = args.output + input_filenames = args.input_files + + result = subprocess.run( + [llvm_link_bin, "-f", "-o", "-", "/dev/null"] + + list( + itertools.chain.from_iterable(("--override", f) for f in input_filenames) + ), + capture_output=True, + check=True, + ) + + llvm_output = result.stdout + hex_string = ",".join( + str(byte if byte < 128 else byte - 256) for byte in llvm_output + ) + + with open(output_filename, "w") as output_file: + output_file.write(hex_string) + + +if __name__ == "__main__": + main() diff --git a/third_party/xla/xla/strict.default.bzl b/third_party/xla/xla/strict.default.bzl new file mode 100644 index 00000000000000..2042d4a98d05fb --- /dev/null +++ b/third_party/xla/xla/strict.default.bzl @@ -0,0 +1,13 @@ +"""Default (OSS) build versions of Python strict rules.""" + +# Placeholder to use until bazel supports py_strict_binary. +def py_strict_binary(name, **kwargs): + native.py_binary(name = name, **kwargs) + +# Placeholder to use until bazel supports py_strict_library. +def py_strict_library(name, **kwargs): + native.py_library(name = name, **kwargs) + +# Placeholder to use until bazel supports py_strict_test. +def py_strict_test(name, **kwargs): + native.py_test(name = name, **kwargs)