- 
                Notifications
    You must be signed in to change notification settings 
- Fork 791
[SYCL][CUDA][HIP] Fix enable-global-offset flag #11674
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
a208542
              1ed65a7
              c9d9d1f
              e7ade02
              638c4c4
              96bdba9
              be1ccba
              0fef339
              File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
|  | @@ -59,11 +59,21 @@ ModulePass *llvm::createGlobalOffsetPassLegacy() { | |
| return new GlobalOffsetLegacy(); | ||
| } | ||
|  | ||
| // Recursive helper function to collect Loads from GEPs in a BFS fashion. | ||
| static void getLoads(Instruction *P, SmallVectorImpl<Instruction *> &Traversed, | ||
| SmallVectorImpl<LoadInst *> &Loads) { | ||
| Traversed.push_back(P); | ||
| There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. May be an assert at the very top of this function body to check for *p being either a load or a GEP might be better? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean the check is already taking place in the  | ||
| if (auto *L = dyn_cast<LoadInst>(P)) // Base case for recursion | ||
| Loads.push_back(L); | ||
| else { | ||
| assert(isa<GetElementPtrInst>(*P)); | ||
|         
                  MartinWehking marked this conversation as resolved.
              Show resolved
            Hide resolved | ||
| for (Value *V : P->users()) | ||
| getLoads(cast<Instruction>(V), Traversed, Loads); | ||
| } | ||
| } | ||
|  | ||
| // New PM implementation. | ||
| PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { | ||
| if (!EnableGlobalOffset) | ||
| return PreservedAnalyses::all(); | ||
|  | ||
| AT = TargetHelpers::getArchType(M); | ||
| Function *ImplicitOffsetIntrinsic = M.getFunction(Intrinsic::getName( | ||
| AT == ArchType::Cuda | ||
|  | @@ -73,33 +83,62 @@ PreservedAnalyses GlobalOffsetPass::run(Module &M, ModuleAnalysisManager &) { | |
| if (!ImplicitOffsetIntrinsic || ImplicitOffsetIntrinsic->use_empty()) | ||
| return PreservedAnalyses::all(); | ||
|  | ||
| // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is | ||
| // happy with ADDRESS_SPACE_GENERIC (0). | ||
| TargetAS = AT == ArchType::Cuda ? 0 : 5; | ||
| /// The value for NVVM's ADDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen | ||
| /// to be 3, use it for the implicit argument pointer type. | ||
| KernelImplicitArgumentType = | ||
| ArrayType::get(Type::getInt32Ty(M.getContext()), 3); | ||
| ImplicitOffsetPtrType = | ||
| Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS); | ||
| assert((ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && | ||
| "Implicit offset intrinsic does not return the expected type"); | ||
| if (!EnableGlobalOffset) { | ||
| SmallVector<CallInst *, 4> Worklist; | ||
| SmallVector<LoadInst *, 4> LI; | ||
| SmallVector<Instruction *, 4> PtrUses; | ||
|  | ||
| SmallVector<KernelPayload, 4> KernelPayloads; | ||
| TargetHelpers::populateKernels(M, KernelPayloads, AT); | ||
| // Collect all GEPs and Loads from the intrinsic's CallInsts | ||
| for (Value *V : ImplicitOffsetIntrinsic->users()) { | ||
|         
                  MartinWehking marked this conversation as resolved.
              Outdated
          
            Show resolved
            Hide resolved | ||
| Worklist.push_back(cast<CallInst>(V)); | ||
| for (Value *V2 : V->users()) | ||
| getLoads(cast<Instruction>(V2), PtrUses, LI); | ||
| } | ||
|  | ||
| // Replace each use of a collected Load with a Constant 0 | ||
| for (LoadInst *L : LI) | ||
| L->replaceAllUsesWith(ConstantInt::get(L->getType(), 0)); | ||
|  | ||
| // Validate kernels and populate entry map | ||
| EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads); | ||
| // Remove all collected Loads and GEPs from the kernel. | ||
| // PtrUses is returned by `getLoads` in topological order. | ||
| // Walk it backwards so we don't violate users. | ||
| for (auto *I : reverse(PtrUses)) | ||
| I->eraseFromParent(); | ||
|  | ||
| // Add implicit parameters to all direct and indirect users of the offset | ||
| addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); | ||
| // Remove all collected CallInsts from the kernel. | ||
| for (CallInst *CI : Worklist) { | ||
| auto *I = cast<Instruction>(CI); | ||
| I->eraseFromParent(); | ||
| } | ||
| } else { | ||
| // For AMD allocas and pointers have to be to CONSTANT_PRIVATE (5), NVVM is | ||
| // happy with ADDRESS_SPACE_GENERIC (0). | ||
| TargetAS = AT == ArchType::Cuda ? 0 : 5; | ||
| /// The value for NVVM's adDRESS_SPACE_SHARED and AMD's LOCAL_ADDRESS happen | ||
| /// to be 3, use it for the implicit argument pointer type. | ||
| KernelImplicitArgumentType = | ||
| ArrayType::get(Type::getInt32Ty(M.getContext()), 3); | ||
| ImplicitOffsetPtrType = | ||
| Type::getInt32Ty(M.getContext())->getPointerTo(TargetAS); | ||
| assert( | ||
| (ImplicitOffsetIntrinsic->getReturnType() == ImplicitOffsetPtrType) && | ||
| "Implicit offset intrinsic does not return the expected type"); | ||
|  | ||
| SmallVector<KernelPayload, 4> KernelPayloads; | ||
| TargetHelpers::populateKernels(M, KernelPayloads, AT); | ||
|  | ||
| // Validate kernels and populate entry map | ||
| EntryPointMetadata = generateKernelMDNodeMap(M, KernelPayloads); | ||
|  | ||
| // Add implicit parameters to all direct and indirect users of the offset | ||
| addImplicitParameterToCallers(M, ImplicitOffsetIntrinsic, nullptr); | ||
| } | ||
|  | ||
| // Assert that all uses of `ImplicitOffsetIntrinsic` are removed and delete | ||
| // it. | ||
| assert(ImplicitOffsetIntrinsic->use_empty() && | ||
| "Not all uses of intrinsic removed"); | ||
| ImplicitOffsetIntrinsic->eraseFromParent(); | ||
|  | ||
| return PreservedAnalyses::none(); | ||
| } | ||
|  | ||
|  | ||
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,20 @@ | ||
| ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s | ||
|  | ||
| ; This test checks that the implicit offset intrinsic is correctly removed | ||
|  | ||
| declare ptr addrspace(5) @llvm.amdgcn.implicit.offset() | ||
| ; CHECK-NOT: llvm.amdgcn.implicit.offset | ||
|  | ||
| define weak_odr dso_local i64 @_ZTS14example_kernel() { | ||
| entry: | ||
| ; CHECK-NOT: @llvm.amdgcn.implicit.offset() | ||
| ; CHECK-NOT: getelementptr | ||
| ; CHECK-NOT: load | ||
| ; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}} | ||
| ; CHECK: ret i{{[0-9]+}} [[REG]] | ||
| %0 = tail call ptr addrspace(5) @llvm.amdgcn.implicit.offset() | ||
| %1 = getelementptr inbounds i32, ptr addrspace(5) %0, i64 1 | ||
| %2 = load i32, ptr addrspace(5) %1, align 4 | ||
| %3 = zext i32 %2 to i64 | ||
| ret i64 %3 | ||
| } | 
| Original file line number | Diff line number | Diff line change | 
|---|---|---|
| @@ -0,0 +1,21 @@ | ||
| ; RUN: opt -bugpoint-enable-legacy-pm -globaloffset -enable-global-offset=false %s -S -o - | FileCheck %s | ||
| target triple = "nvptx64-nvidia-cuda" | ||
|  | ||
| ; This test checks that the implicit offset intrinsic is correctly removed | ||
|  | ||
| declare ptr @llvm.nvvm.implicit.offset() | ||
| ; CHECK-NOT: llvm.nvvm.implicit.offset | ||
|  | ||
| define weak_odr dso_local i64 @_ZTS14example_kernel() { | ||
| entry: | ||
| ; CHECK-NOT: @llvm.nvvm.implicit.offset() | ||
| ; CHECK-NOT: getelementptr | ||
| ; CHECK-NOT: load | ||
| ; CHECK: [[REG:%[0-9]+]] = zext i{{[0-9]+}} 0 to i{{[0-9]+}} | ||
| ; CHECK: ret i{{[0-9]+}} [[REG]] | ||
| %0 = tail call ptr @llvm.nvvm.implicit.offset() | ||
| %1 = getelementptr inbounds i32, ptr %0, i64 1 | ||
| %2 = load i32, ptr %1, align 4 | ||
| %3 = zext i32 %2 to i64 | ||
| ret i64 %3 | ||
| } | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: it was a bit confusing with the name 'Traversed'. Can we not just call this also as 'PtrUses'?
Also, the function name seems a bit incomplete. May be getLoadsAndGEPs?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I prefer the name traversed here honestly, since it is capturing the approach that this function follows a bit better.
It does a BFS for finding
LoadsandGEPs while keeping track of the already traversed functions.The name focuses on the goal of this function: To collect the Loads in a small vector.
The traversal of GEPs could be more seen as a byproduct.