diff --git a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
index 05e1a7431975f..02e6ed3c32f2c 100644
--- a/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
+++ b/llvm/lib/SYCLLowerIR/GlobalOffset.cpp
@@ -59,11 +59,21 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() {
   return new GlobalOffsetLegacy();
 }
 
+// Recursive helper function to collect Loads from GEPs in a DFS fashion.
+static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed,
+                     SmallVectorImpl<LoadInst *> &Loads) {
+  Traversed.push_back(P);
+  if (auto *L = dyn_cast<LoadInst>(P)) // Base case for the recursion.
+    Loads.push_back(L);
+  else {
+    assert(isa<GetElementPtrInst>(*P));
+    for (Value *V : P->users())
+      getLoads(cast<Instruction>(V), Traversed, Loads);
+  }
+}
+
 // New PM implementation.
 PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
-  if (!EnableGlobalOffset)
-    return PreservedAnalyses::all();
-
   AT = TargetHelpers::getArchType(M);
   Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName(
       AT == ArchType::Cuda
@@ -73,33 +83,62 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) {
   if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty())
     return PreservedAnalyses::all();
 
-  // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is
-  // happy with ADDRESS_SPACE_GENERIC (0).
-  TargetAS = AT == ArchType::Cuda ? 0 : 5;
-  /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen
-  /// to be 3, use it for the implicit argument pointer type.
-  KernelImplicitArgumentType =
-      ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
-  ImplicitOffsetPtrType =
-      Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
-  assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
-         "Implicit offset intrinsic does not return the expected type");
+  if (!EnableGlobalOffset) {
+    SmallVector<CallInst *, 4> Worklist;
+    SmallVector<LoadInst *, 4> LI;
+    SmallVector<Instruction *, 4> PtrUses;
 
-  SmallVector<KernelPayload, 4> KernelPayloads;
-  TargetHelpers::populateKernels(M, KernelPayloads, AT);
+    // Collect all GEPs and Loads from the intrinsic's CallInsts.
+    for (Value *V : ImplicitOffsetIntrinsic->users()) {
+      Worklist.push_back(cast<CallInst>(V));
+      for (Value *V2 : V->users())
+        getLoads(cast<Instruction>(V2), PtrUses, LI);
+    }
+
+    // Replace each use of a collected Load with a constant 0.
+    for (LoadInst *L : LI)
+      L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0));
 
-  // Validate kernels and populate entry map
+    // Remove all collected Loads and GEPs from the kernel. PtrUses is
+    // filled by `getLoads` in topological order; walk it backwards so
+    // users are erased before the instructions they depend on.
+    for (auto *I : reverse(PtrUses))
+      I->eraseFromParent();
 
-  EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+    // Remove all collected CallInsts from the kernel.
+    for (CallInst *CI : Worklist) {
+      auto *I = cast<Instruction>(CI);
+      I->eraseFromParent();
+    }
 
-  // Add implicit parameters to all direct and indirect users of the offset
-  addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+  } else {
+    // For AMD, allocas and pointers have to be in CONSTANT_PRIVATE (5); NVVM
+    // is happy with ADDRESS_SPACE_GENERIC (0).
+    TargetAS = AT == ArchType::Cuda ? 0 : 5;
+    /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS
+    /// happens to be 3; use it for the implicit argument pointer type.
+    KernelImplicitArgumentType =
+        ArrayType::get(Type::getInt32Ty(M.getContext()), 3);
+    ImplicitOffsetPtrType =
+        Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS);
+    assert(
+        (ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) &&
+        "Implicit offset intrinsic does not return the expected type");
+
+    SmallVector<KernelPayload, 4> KernelPayloads;
+    TargetHelpers::populateKernels(M, KernelPayloads, AT);
+
+    // Validate kernels and populate the entry map.
+    EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads);
+
+    // Add implicit parameters to all direct and indirect users of the offset.
+    addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr);
+  }
 
   // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete
   // it.
   assert(ImplicitOffsetIntrinsic->use_empty() &&
          "Not all uses of intrinsic removed");
   ImplicitOffsetIntrinsic->eraseFromParent();
-
   return PreservedAnalyses::none();
 }
diff --git a/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
new file mode 100644
index 0000000000000..ccc06cf248a68
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/global-offset-removal.ll
@@ -0,0 +1,20 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+
+; This test checks that the implicit offset intrinsic is correctly removed.
+
+declare ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: llvm.amdgcn.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.amdgcn.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
+  %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1
+  %2 = load i32, ptr addrspace(5) %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
diff --git a/llvm/test/CodeGen/NVPTX/global-offset-removal.ll b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
new file mode 100644
index 0000000000000..da116feede474
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/global-offset-removal.ll
@@ -0,0 +1,21 @@
+; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s
+target triple = "nvptx64-nvidia-cuda"
+
+; This test checks that the implicit offset intrinsic is correctly removed.
+
+declare ptr @llvm.nvvm.implicit.offset()
+; CHECK-NOT: llvm.nvvm.implicit.offset
+
+define weak_odr dso_local i64 @_ZTS14example_kernel() {
+entry:
+; CHECK-NOT: @llvm.nvvm.implicit.offset()
+; CHECK-NOT: getelementptr
+; CHECK-NOT: load
+; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}}
+; CHECK: ret i{{[0-9]+}} [[REG]]
+  %0 = tail call ptr @llvm.nvvm.implicit.offset()
+  %1 = getelementptr inbounds i32, ptr %0, i64 1
+  %2 = load i32, ptr %1, align 4
+  %3 = zext i32 %2 to i64
+  ret i64 %3
+}
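Note on the expected output: with -enable-global-offset=false, the CHECK lines in both tests amount to the kernel body collapsing to a zext of the constant 0 that replaced the erased load. A minimal sketch of the resulting IR for the NVPTX kernel above (the register numbering is illustrative, not copied from actual opt output):

; After the pass: the call, GEP and load are gone; only the zext of 0 remains.
define weak_odr dso_local i64 @_ZTS14example_kernel() {
entry:
  %0 = zext i32 0 to i64
  ret i64 %0
}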