-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU] Intrinsic for launching whole wave functions #145859
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
34 commits
Select commit
Hold shift + click to select a range
49f9f87
Add subtarget feature
rovka 64a3d2f
[AMDGPU] ISel & PEI for whole wave functions
rovka 47594fd
Use MF instead of MBB
rovka 08ef43e
Revert "Add subtarget feature"
rovka 02d7aec
Add new CC. Do nothing
rovka e4b378f
Replace SubtargetFeature with CallingConv
rovka 97ba693
Enable gisel in tests
rovka 7b44133
GISel support
rovka 2d2f85b
Rename pseudo to match others
rovka fb6d20a
Rename CC
rovka 81adaba
Fix formatting
rovka 9931578
Update tests after merge
rovka 7b75dff
Fix bug in testcase
rovka de8c395
Test inreg args
rovka 5b5e137
Add docs and fixme
rovka 1123374
Remove kill flags on orig exec mask
rovka 339d2c7
Add helper to add orig exec to return
rovka 154c430
Test with single use of orig exec
rovka 843136a
Test calling gfx func from wwf
rovka 9e3d9f2
Test wave64
rovka f6f9337
Fix a few missed spots
rovka 639fb8c
clang-format
rovka f19a8df
Fix CC in test
rovka 846aa2b
Verifier checks for whole wave funcs
rovka 9bed239
[AMDGPU] Intrinsic for launching whole wave functions
rovka 974e0fb
Remove Verifier check that I moved to previous PR
rovka 8dc9461
Remove embarrassing cast
rovka f487cb4
Address review comments
rovka b104da3
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/w…
rovka 3e6b02c
Merge branch 'main' into users/rovka/whole-wave-funcs-call
rovka 197b56d
Fixup merge mishap
rovka 906f8ca
Merge remote-tracking branch 'remotes/origin/main' into users/rovka/w…
rovka 7a850d8
s/size != 0/empty
rovka 6d9c46f
Address review comments
rovka File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,174 @@ | ||
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 | ||
; RUN: llc -global-isel=0 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=DAGISEL | ||
; RUN: llc -global-isel=1 -mtriple=amdgcn--amdpal -mcpu=gfx1200 < %s | FileCheck %s --check-prefix=GISEL | ||
|
||
declare amdgpu_gfx_whole_wave i32 @good_callee(i1 %active, i32 %x, i32 %y, i32 inreg %c) | ||
|
||
; basic_test: an amdgpu_gfx function that calls @good_callee (declared above
; with the amdgpu_gfx_whole_wave calling convention) through the
; @llvm.amdgcn.call.whole.wave intrinsic and stores the returned i32 to %ptr.
; The call passes %x, %x + 13 and the inreg value %c; note that the callee's
; leading `i1 %active` parameter has no matching operand at the call site.
; The assertions below are autogenerated (see the NOTE at the top of the file)
; for both the -global-isel=0 and -global-isel=1 llc invocations, so they
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
define amdgpu_gfx void @basic_test(i32 %x, i32 inreg %c, ptr addrspace(1) %ptr) { | ||
; DAGISEL-LABEL: basic_test: | ||
; DAGISEL: ; %bb.0: | ||
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
; DAGISEL-NEXT: s_wait_expcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_samplecnt 0x0 | ||
; DAGISEL-NEXT: s_wait_bvhcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_kmcnt 0x0 | ||
; DAGISEL-NEXT: s_mov_b32 s0, s33 | ||
; DAGISEL-NEXT: s_mov_b32 s33, s32 | ||
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; DAGISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; DAGISEL-NEXT: v_writelane_b32 v42, s0, 2 | ||
; DAGISEL-NEXT: s_clause 0x1 | ||
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4 | ||
; DAGISEL-NEXT: scratch_store_b32 off, v41, s33 | ||
; DAGISEL-NEXT: v_dual_mov_b32 v41, v2 :: v_dual_mov_b32 v40, v1 | ||
; DAGISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0 | ||
; DAGISEL-NEXT: v_writelane_b32 v42, s30, 0 | ||
; DAGISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi | ||
; DAGISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo | ||
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16 | ||
; DAGISEL-NEXT: v_writelane_b32 v42, s31, 1 | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] | ||
; DAGISEL-NEXT: global_store_b32 v[40:41], v0, off | ||
; DAGISEL-NEXT: s_clause 0x1 | ||
; DAGISEL-NEXT: scratch_load_b32 v41, off, s33 | ||
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4 | ||
; DAGISEL-NEXT: v_readlane_b32 s31, v42, 1 | ||
; DAGISEL-NEXT: v_readlane_b32 s30, v42, 0 | ||
; DAGISEL-NEXT: s_mov_b32 s32, s33 | ||
; DAGISEL-NEXT: v_readlane_b32 s0, v42, 2 | ||
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; DAGISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; DAGISEL-NEXT: s_mov_b32 s33, s0 | ||
; DAGISEL-NEXT: s_wait_loadcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-LABEL: basic_test: | ||
; GISEL: ; %bb.0: | ||
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
; GISEL-NEXT: s_wait_expcnt 0x0 | ||
; GISEL-NEXT: s_wait_samplecnt 0x0 | ||
; GISEL-NEXT: s_wait_bvhcnt 0x0 | ||
; GISEL-NEXT: s_wait_kmcnt 0x0 | ||
; GISEL-NEXT: s_mov_b32 s0, s33 | ||
; GISEL-NEXT: s_mov_b32 s33, s32 | ||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; GISEL-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; GISEL-NEXT: v_writelane_b32 v42, s0, 2 | ||
; GISEL-NEXT: s_clause 0x1 | ||
; GISEL-NEXT: scratch_store_b32 off, v40, s33 offset:4 | ||
; GISEL-NEXT: scratch_store_b32 off, v41, s33 | ||
; GISEL-NEXT: v_dual_mov_b32 v40, v1 :: v_dual_mov_b32 v41, v2 | ||
; GISEL-NEXT: v_add_nc_u32_e32 v1, 13, v0 | ||
; GISEL-NEXT: v_writelane_b32 v42, s30, 0 | ||
; GISEL-NEXT: s_mov_b32 s0, good_callee@abs32@lo | ||
; GISEL-NEXT: s_mov_b32 s1, good_callee@abs32@hi | ||
; GISEL-NEXT: s_add_co_i32 s32, s32, 16 | ||
; GISEL-NEXT: v_writelane_b32 v42, s31, 1 | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] | ||
; GISEL-NEXT: global_store_b32 v[40:41], v0, off | ||
; GISEL-NEXT: s_clause 0x1 | ||
; GISEL-NEXT: scratch_load_b32 v41, off, s33 | ||
; GISEL-NEXT: scratch_load_b32 v40, off, s33 offset:4 | ||
; GISEL-NEXT: v_readlane_b32 s31, v42, 1 | ||
; GISEL-NEXT: v_readlane_b32 s30, v42, 0 | ||
; GISEL-NEXT: s_mov_b32 s32, s33 | ||
; GISEL-NEXT: v_readlane_b32 s0, v42, 2 | ||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; GISEL-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; GISEL-NEXT: s_mov_b32 s33, s0 | ||
; GISEL-NEXT: s_wait_loadcnt 0x0 | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_setpc_b64 s[30:31] | ||
; %y = %x + 13 becomes the second i32 operand forwarded to the callee.
%y = add i32 %x, 13 | ||
; The intrinsic's first operand is the callee pointer; the remaining operands
; are forwarded as call arguments, with %c keeping its inreg attribute.
%ret = call i32(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @good_callee, i32 %x, i32 %y, i32 inreg %c) | ||
; Store the callee's i32 result to the global (addrspace(1)) pointer argument.
store i32 %ret, ptr addrspace(1) %ptr | ||
ret void | ||
} | ||
|
||
declare amdgpu_gfx_whole_wave void @void_callee(i1 %active, i32 %x) | ||
|
||
; ret_void: an amdgpu_gfx function that calls @void_callee (declared above
; with the amdgpu_gfx_whole_wave calling convention and a void return type)
; through the @llvm.amdgcn.call.whole.wave intrinsic, then returns.
; Only %x is passed; the callee's leading `i1 %active` parameter has no
; matching operand at the call site, and the call produces no value.
; The assertions below are autogenerated (see the NOTE at the top of the file)
; for both the -global-isel=0 and -global-isel=1 llc invocations, so they
; should be regenerated with utils/update_llc_test_checks.py, not hand-edited.
define amdgpu_gfx void @ret_void(i32 %x) { | ||
; DAGISEL-LABEL: ret_void: | ||
; DAGISEL: ; %bb.0: | ||
; DAGISEL-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
; DAGISEL-NEXT: s_wait_expcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_samplecnt 0x0 | ||
; DAGISEL-NEXT: s_wait_bvhcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_kmcnt 0x0 | ||
; DAGISEL-NEXT: s_mov_b32 s0, s33 | ||
; DAGISEL-NEXT: s_mov_b32 s33, s32 | ||
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; DAGISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; DAGISEL-NEXT: v_writelane_b32 v40, s0, 2 | ||
; DAGISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi | ||
; DAGISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo | ||
; DAGISEL-NEXT: s_add_co_i32 s32, s32, 16 | ||
; DAGISEL-NEXT: v_writelane_b32 v40, s30, 0 | ||
; DAGISEL-NEXT: v_writelane_b32 v40, s31, 1 | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] | ||
; DAGISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||
; DAGISEL-NEXT: v_readlane_b32 s31, v40, 1 | ||
; DAGISEL-NEXT: v_readlane_b32 s30, v40, 0 | ||
; DAGISEL-NEXT: s_mov_b32 s32, s33 | ||
; DAGISEL-NEXT: v_readlane_b32 s0, v40, 2 | ||
; DAGISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; DAGISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; DAGISEL-NEXT: s_mov_b32 s33, s0 | ||
; DAGISEL-NEXT: s_wait_loadcnt 0x0 | ||
; DAGISEL-NEXT: s_wait_alu 0xfffe | ||
; DAGISEL-NEXT: s_setpc_b64 s[30:31] | ||
; | ||
; GISEL-LABEL: ret_void: | ||
; GISEL: ; %bb.0: | ||
; GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 | ||
; GISEL-NEXT: s_wait_expcnt 0x0 | ||
; GISEL-NEXT: s_wait_samplecnt 0x0 | ||
; GISEL-NEXT: s_wait_bvhcnt 0x0 | ||
; GISEL-NEXT: s_wait_kmcnt 0x0 | ||
; GISEL-NEXT: s_mov_b32 s0, s33 | ||
; GISEL-NEXT: s_mov_b32 s33, s32 | ||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; GISEL-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; GISEL-NEXT: v_writelane_b32 v40, s0, 2 | ||
; GISEL-NEXT: s_mov_b32 s0, void_callee@abs32@lo | ||
; GISEL-NEXT: s_mov_b32 s1, void_callee@abs32@hi | ||
; GISEL-NEXT: s_add_co_i32 s32, s32, 16 | ||
; GISEL-NEXT: v_writelane_b32 v40, s30, 0 | ||
; GISEL-NEXT: v_writelane_b32 v40, s31, 1 | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] | ||
; GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | ||
; GISEL-NEXT: v_readlane_b32 s31, v40, 1 | ||
; GISEL-NEXT: v_readlane_b32 s30, v40, 0 | ||
; GISEL-NEXT: s_mov_b32 s32, s33 | ||
; GISEL-NEXT: v_readlane_b32 s0, v40, 2 | ||
; GISEL-NEXT: s_or_saveexec_b32 s1, -1 | ||
; GISEL-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_mov_b32 exec_lo, s1 | ||
; GISEL-NEXT: s_mov_b32 s33, s0 | ||
; GISEL-NEXT: s_wait_loadcnt 0x0 | ||
; GISEL-NEXT: s_wait_alu 0xfffe | ||
; GISEL-NEXT: s_setpc_b64 s[30:31] | ||
; The intrinsic's first operand is the callee pointer; %x is the only
; forwarded argument and the void result is discarded.
call void(ptr, ...) @llvm.amdgcn.call.whole.wave(ptr @void_callee, i32 %x) | ||
ret void | ||
} | ||
|
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.