From 48f1915d6ee71a1c125c05b9359dd72158312e91 Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Tue, 14 Oct 2025 02:56:44 -0700 Subject: [PATCH 1/3] Add register bank legalization for G_FADD --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 3 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 11 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 4 + llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll | 246 ++++++++++++++++++ 4 files changed, 263 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 540756653dd22..198ee6b73b0b5 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -837,6 +837,7 @@ LLT RegBankLegalizeHelper::getTyFromID(RegBankLLTMappingApplyID ID) { return LLT::scalar(32); case Sgpr64: case Vgpr64: + case UniInVgprS64: return LLT::scalar(64); case Sgpr128: case Vgpr128: @@ -960,6 +961,7 @@ RegBankLegalizeHelper::getRegBankFromID(RegBankLLTMappingApplyID ID) { case UniInVcc: case UniInVgprS16: case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: case UniInVgprB32: @@ -1092,6 +1094,7 @@ void RegBankLegalizeHelper::applyMappingDst( break; } case UniInVgprS32: + case UniInVgprS64: case UniInVgprV2S16: case UniInVgprV4S32: { assert(Ty == getTyFromID(MethodIDs[OpIdx])); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index a67b12a22589c..0b6decc699ccb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -906,9 +906,18 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, bool hasSALUFloat = ST->hasSALUFloatInsts(); addRulesForGOpcs({G_FADD}, Standard) + .Uni(S16, {{UniInVgprS16}, {Vgpr16, Vgpr16}}, !hasSALUFloat) + .Uni(S16, {{Sgpr16}, 
{Sgpr16, Sgpr16}}, hasSALUFloat) + .Div(S16, {{Vgpr16}, {Vgpr16, Vgpr16}}) .Uni(S32, {{Sgpr32}, {Sgpr32, Sgpr32}}, hasSALUFloat) .Uni(S32, {{UniInVgprS32}, {Vgpr32, Vgpr32}}, !hasSALUFloat) - .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}); + .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) + .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) + .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) + .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); addRulesForGOpcs({G_FPTOUI}) .Any({{UniS32, S32}, {{Sgpr32}, {Sgpr32}}}, hasSALUFloat) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 93e0efda77fdd..1cf9ae2e226ca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -92,8 +92,10 @@ enum UniformityLLTOpPredicateID { V4S32, UniV2S16, + UniV2S32, DivV2S16, + DivV2S32, // B types B32, @@ -178,7 +180,9 @@ enum RegBankLLTMappingApplyID { UniInVcc, UniInVgprS16, UniInVgprS32, + UniInVgprS64, UniInVgprV2S16, + UniInVgprV2S32, UniInVgprV4S32, UniInVgprB32, UniInVgprB64, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll new file mode 100644 index 0000000000000..ec221496f450c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -0,0 +1,246 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal 
-mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s + +define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { +; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; GFX11-GISEL-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps half @fadd_s16_div(half %a, half %b) { +; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div: +; GFX11-SDAG-FAKE16: ; %bb.0: +; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div: +; GFX11-SDAG-TRUE16: ; %bb.0: +; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div: +; GFX11-GISEL-FAKE16: ; %bb.0: +; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div: +; GFX11-GISEL-TRUE16: ; %bb.0: +; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div: +; GFX12-SDAG-FAKE16: ; %bb.0: +; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div: +; GFX12-SDAG-TRUE16: ; %bb.0: +; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div: +; GFX12-GISEL-FAKE16: ; %bb.0: +; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div: +; GFX12-GISEL-TRUE16: ; %bb.0: +; GFX12-GISEL-TRUE16-NEXT: 
v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog + %fadd = fadd half %a, %b + ret half %fadd +} + +define amdgpu_ps float @fadd_s32_uniform(float inreg %a, float inreg %b) { +; GFX11-LABEL: fadd_s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f32_e64 v0, s0, s1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_add_f32 s0, s0, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps float @fadd_s32_div(float %a, float %b) { +; GCN-LABEL: fadd_s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_add_f32_e32 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog + %fadd = fadd float %a, %b + ret float %fadd +} + +define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) { +; GFX11-LABEL: fadd_s64_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s64_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: s_wait_alu 0xf1ff +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd double %a, %b + ret double %fadd +} + +define amdgpu_ps double @fadd_s64_div(double %a, double %b) { +; GFX11-LABEL: fadd_s64_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, 
v1 +; GFX11-NEXT: ; return to shader part epilog +; +; GFX12-LABEL: fadd_s64_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-NEXT: v_readfirstlane_b32 s1, v1 +; GFX12-NEXT: ; return to shader part epilog + %fadd = fadd double %a, %b + ret double %fadd +} + +define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +; GFX11-LABEL: fadd_v2s16_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fadd_v2s16_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v0, s0, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GFX11-LABEL: fadd_v2s16_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fadd_v2s16_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_pk_add_f16 v0, v0, v1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fadd = fadd <2 x half> %a, %b + ret <2 x half> %fadd +} + +define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +; GFX11-LABEL: fadd_v2s32_uniform: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 +; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX12-LABEL: fadd_v2s32_uniform: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_add_f32 s0, s0, s2 +; GFX12-NEXT: s_add_f32 s1, s1, s3 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} + +define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GFX11-LABEL: fadd_v2s32_div: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX11-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: fadd_v2s32_div: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %fadd = fadd <2 x float> %a, %b + ret <2 x float> %fadd +} +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX11-GISEL: {{.*}} +; GFX11-SDAG: {{.*}} +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} From cda47e5221beb14b945f1db391d04726b789744c Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Thu, 30 Oct 2025 10:02:54 +0000 Subject: [PATCH 2/3] Address review comments: Scalarize v2s16 for uniform operation and implement combine logic for ReadAnyLane + Trunc + AnyExt. 
--- .../Target/AMDGPU/AMDGPURegBankLegalize.cpp | 21 +- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 19 ++ .../AMDGPU/AMDGPURegBankLegalizeHelper.h | 2 + .../AMDGPU/AMDGPURegBankLegalizeRules.cpp | 4 +- .../AMDGPU/AMDGPURegBankLegalizeRules.h | 1 + llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll | 184 ++++++------------ 6 files changed, 99 insertions(+), 132 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp index e1879598f098a..907f8300de6d2 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalize.cpp @@ -24,6 +24,7 @@ #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/CSEMIRBuilder.h" #include "llvm/CodeGen/GlobalISel/GenericMachineInstrs.h" +#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineUniformityAnalysis.h" @@ -34,9 +35,17 @@ using namespace llvm; using namespace AMDGPU; +using namespace llvm::MIPatternMatch; namespace { +// AMDGPU-specific pattern matchers +template <typename SrcTy> +inline UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE> +m_GAMDGPUReadAnyLane(const SrcTy &Src) { + return UnaryOp_match<SrcTy, AMDGPU::G_AMDGPU_READANYLANE>(Src); +} + class AMDGPURegBankLegalize : public MachineFunctionPass { public: static char ID; @@ -160,10 +169,18 @@ AMDGPURegBankLegalizeCombiner::tryMatchRALFromUnmerge(Register Src) { Register AMDGPURegBankLegalizeCombiner::getReadAnyLaneSrc(Register Src) { // Src = G_AMDGPU_READANYLANE RALSrc - auto [RAL, RALSrc] = tryMatch(Src, AMDGPU::G_AMDGPU_READANYLANE); - if (RAL) + Register RALSrc; + if (mi_match(Src, MRI, m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))) return RALSrc; + // TruncSrc = G_AMDGPU_READANYLANE RALSrc + // AextSrc = G_TRUNC TruncSrc + // Src = G_ANYEXT AextSrc + if (mi_match(Src, MRI, + m_GAnyExt(m_GTrunc(m_GAMDGPUReadAnyLane(m_Reg(RALSrc)))))) { + return RALSrc; + } + // LoVgpr, HiVgpr = G_UNMERGE_VALUES UnmergeSrc // LoSgpr = 
G_AMDGPU_READANYLANE LoVgpr // HiSgpr = G_AMDGPU_READANYLANE HiVgpr diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 198ee6b73b0b5..30df80fb62a8b 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -616,6 +616,23 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { MI.eraseFromParent(); } +void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { + Register Dst = MI.getOperand(0).getReg(); + assert(MRI.getType(Dst) == V2S16); + auto [Op0Lo32, Op0Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(2).getReg()); + unsigned Opc = MI.getOpcode(); + auto Flags = MI.getFlags(); + auto Op0Lo = B.buildTrunc(SgprRB_S16, Op0Lo32); + auto Op0Hi = B.buildTrunc(SgprRB_S16, Op0Hi32); + auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); + auto Op1Hi = B.buildTrunc(SgprRB_S16, Op1Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op0Lo, Op1Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op0Hi, Op1Hi}, Flags); + B.buildMergeLikeInstr(Dst, {Lo, Hi}); + MI.eraseFromParent(); +} + void RegBankLegalizeHelper::lowerSplitTo32Select(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); LLT DstTy = MRI.getType(Dst); @@ -688,6 +705,8 @@ void RegBankLegalizeHelper::lower(MachineInstr &MI, return lowerUnpackBitShift(MI); case UnpackMinMax: return lowerUnpackMinMax(MI); + case ScalarizeToS16: + return lowerSplitTo16(MI); case Ext32To64: { const RegisterBank *RB = MRI.getRegBank(MI.getOperand(0).getReg()); MachineInstrBuilder Hi; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h index d937815bf4714..df0d7ef4689fe 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.h @@ -72,6 +72,7 @@ class RegBankLegalizeHelper { 
static constexpr LLT P6 = LLT::pointer(6, 32); MachineRegisterInfo::VRegAttrs SgprRB_S32 = {SgprRB, S32}; + MachineRegisterInfo::VRegAttrs SgprRB_S16 = {SgprRB, S16}; MachineRegisterInfo::VRegAttrs VgprRB_S32 = {VgprRB, S32}; MachineRegisterInfo::VRegAttrs VccRB_S1 = {VccRB, S1}; @@ -121,6 +122,7 @@ class RegBankLegalizeHelper { void lowerV_BFE(MachineInstr &MI); void lowerS_BFE(MachineInstr &MI); void lowerSplitTo32(MachineInstr &MI); + void lowerSplitTo16(MachineInstr &MI); void lowerSplitTo32Select(MachineInstr &MI); void lowerSplitTo32SExtInReg(MachineInstr &MI); void lowerUnpackMinMax(MachineInstr &MI); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp index 0b6decc699ccb..1b2593a3f30a8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.cpp @@ -914,7 +914,9 @@ RegBankLegalizeRules::RegBankLegalizeRules(const GCNSubtarget &_ST, .Div(S32, {{Vgpr32}, {Vgpr32, Vgpr32}}) .Uni(S64, {{UniInVgprS64}, {Vgpr64, Vgpr64}}) .Div(S64, {{Vgpr64}, {Vgpr64, Vgpr64}}) - .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}) + .Uni(V2S16, {{UniInVgprV2S16}, {VgprV2S16, VgprV2S16}}, !hasSALUFloat) + .Uni(V2S16, {{SgprV2S16}, {SgprV2S16, SgprV2S16}, ScalarizeToS16}, + hasSALUFloat) .Div(V2S16, {{VgprV2S16}, {VgprV2S16, VgprV2S16}}) .Any({{UniV2S32}, {{UniInVgprV2S32}, {VgprV2S32, VgprV2S32}}}) .Any({{DivV2S32}, {{VgprV2S32}, {VgprV2S32, VgprV2S32}}}); diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h index 1cf9ae2e226ca..007fedc737512 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeRules.h @@ -221,6 +221,7 @@ enum LoweringMethodID { V_BFE, VgprToVccCopy, SplitTo32, + ScalarizeToS16, SplitTo32Select, SplitTo32SExtInReg, Ext32To64, diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll index ec221496f450c..602ec570cce37 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -1,39 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 -; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-SDAG,GFX11-SDAG-TRUE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-FAKE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-GISEL,GFX11-GISEL-TRUE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-FAKE16 %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-SDAG,GFX12-SDAG-TRUE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-FAKE16 %s -; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-GISEL,GFX12-GISEL-TRUE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1100 -o - %s | FileCheck -check-prefixes=GCN,GFX11,GFX11-TRUE16 %s +; 
RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=-real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-FAKE16 %s +; RUN: llc -global-isel -new-reg-bank-select -mtriple=amdgcn-amd-amdpal -mattr=+real-true16 -mcpu=gfx1200 -o - %s | FileCheck -check-prefixes=GCN,GFX12,GFX12-TRUE16 %s define amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { -; GFX11-SDAG-FAKE16-LABEL: fadd_s16_uniform: -; GFX11-SDAG-FAKE16: ; %bb.0: -; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 -; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: fadd_s16_uniform: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog ; -; GFX11-SDAG-TRUE16-LABEL: fadd_s16_uniform: -; GFX11-SDAG-TRUE16: ; %bb.0: -; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 -; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog -; -; GFX11-GISEL-FAKE16-LABEL: fadd_s16_uniform: -; GFX11-GISEL-FAKE16: ; %bb.0: -; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e64 v0, s0, s1 -; GFX11-GISEL-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-GISEL-FAKE16-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog -; -; GFX11-GISEL-TRUE16-LABEL: fadd_s16_uniform: -; GFX11-GISEL-TRUE16: ; %bb.0: -; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 -; GFX11-GISEL-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-GISEL-TRUE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-GISEL-TRUE16-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog +; GFX11-TRUE16-LABEL: fadd_s16_uniform: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e64 v0.l, s0, s1 +; GFX11-TRUE16-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: fadd_s16_uniform: ; GFX12: ; %bb.0: @@ -46,45 +26,25 @@ define 
amdgpu_ps half @fadd_s16_uniform(half inreg %a, half inreg %b) { } define amdgpu_ps half @fadd_s16_div(half %a, half %b) { -; GFX11-SDAG-FAKE16-LABEL: fadd_s16_div: -; GFX11-SDAG-FAKE16: ; %bb.0: -; GFX11-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: ; return to shader part epilog -; -; GFX11-SDAG-TRUE16-LABEL: fadd_s16_div: -; GFX11-SDAG-TRUE16: ; %bb.0: -; GFX11-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-SDAG-TRUE16-NEXT: ; return to shader part epilog -; -; GFX11-GISEL-FAKE16-LABEL: fadd_s16_div: -; GFX11-GISEL-FAKE16: ; %bb.0: -; GFX11-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX11-GISEL-FAKE16-NEXT: ; return to shader part epilog -; -; GFX11-GISEL-TRUE16-LABEL: fadd_s16_div: -; GFX11-GISEL-TRUE16: ; %bb.0: -; GFX11-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX11-GISEL-TRUE16-NEXT: ; return to shader part epilog -; -; GFX12-SDAG-FAKE16-LABEL: fadd_s16_div: -; GFX12-SDAG-FAKE16: ; %bb.0: -; GFX12-SDAG-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: ; return to shader part epilog -; -; GFX12-SDAG-TRUE16-LABEL: fadd_s16_div: -; GFX12-SDAG-TRUE16: ; %bb.0: -; GFX12-SDAG-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX12-SDAG-TRUE16-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-FAKE16-LABEL: fadd_s16_div: -; GFX12-GISEL-FAKE16: ; %bb.0: -; GFX12-GISEL-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 -; GFX12-GISEL-FAKE16-NEXT: ; return to shader part epilog -; -; GFX12-GISEL-TRUE16-LABEL: fadd_s16_div: -; GFX12-GISEL-TRUE16: ; %bb.0: -; GFX12-GISEL-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l -; GFX12-GISEL-TRUE16-NEXT: ; return to shader part epilog +; GFX11-FAKE16-LABEL: fadd_s16_div: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX11-FAKE16-NEXT: ; return to shader part epilog +; +; GFX11-TRUE16-LABEL: fadd_s16_div: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX11-TRUE16-NEXT: ; return to shader part epilog +; +; 
GFX12-FAKE16-LABEL: fadd_s16_div: +; GFX12-FAKE16: ; %bb.0: +; GFX12-FAKE16-NEXT: v_add_f16_e32 v0, v0, v1 +; GFX12-FAKE16-NEXT: ; return to shader part epilog +; +; GFX12-TRUE16-LABEL: fadd_s16_div: +; GFX12-TRUE16: ; %bb.0: +; GFX12-TRUE16-NEXT: v_add_f16_e32 v0.l, v0.l, v1.l +; GFX12-TRUE16-NEXT: ; return to shader part epilog %fadd = fadd half %a, %b ret half %fadd } @@ -155,92 +115,58 @@ define amdgpu_ps double @fadd_s64_div(double %a, double %b) { ret double %fadd } -define <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { +define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) { ; GFX11-LABEL: fadd_v2s16_uniform: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_pk_add_f16 v0, s0, s1 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: fadd_v2s16_uniform: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v0, s0, s1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: s_lshr_b32 s2, s0, 16 +; GFX12-NEXT: s_lshr_b32 s3, s1, 16 +; GFX12-NEXT: s_add_f16 s0, s0, s1 +; GFX12-NEXT: s_add_f16 s1, s2, s3 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: s_pack_ll_b32_b16 s0, s0, s1 +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: ; return to shader part epilog %fadd = fadd <2 x half> %a, %b ret <2 x half> %fadd } -define <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { -; GFX11-LABEL: fadd_v2s16_div: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_pk_add_f16 v0, v0, v1 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: fadd_v2s16_div: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: 
s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: v_pk_add_f16 v0, v0, v1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +define amdgpu_ps <2 x half> @fadd_v2s16_div(<2 x half> %a, <2 x half> %b) { +; GCN-LABEL: fadd_v2s16_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_pk_add_f16 v0, v0, v1 +; GCN-NEXT: ; return to shader part epilog %fadd = fadd <2 x half> %a, %b ret <2 x half> %fadd } -define <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { +define amdgpu_ps <2 x float> @fadd_v2s32_uniform(<2 x float> inreg %a, <2 x float> inreg %b) { ; GFX11-LABEL: fadd_v2s32_uniform: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_f32_e64 v0, s0, s2 ; GFX11-NEXT: v_add_f32_e64 v1, s1, s3 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-NEXT: ; return to shader part epilog ; ; GFX12-LABEL: fadd_v2s32_uniform: ; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_add_f32 s0, s0, s2 ; GFX12-NEXT: s_add_f32 s1, s1, s3 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_3) ; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-NEXT: s_setpc_b64 s[30:31] +; GFX12-NEXT: ; return to shader part epilog %fadd = fadd <2 x float> %a, %b ret <2 x float> %fadd } -define <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { -; GFX11-LABEL: fadd_v2s32_div: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 -; GFX11-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-LABEL: fadd_v2s32_div: -; GFX12: ; %bb.0: -; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-NEXT: s_wait_expcnt 0x0 -; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: s_wait_bvhcnt 0x0 -; GFX12-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 -; GFX12-NEXT: s_setpc_b64 s[30:31] +define amdgpu_ps <2 x float> @fadd_v2s32_div(<2 x float> %a, <2 x float> %b) { +; GCN-LABEL: fadd_v2s32_div: +; GCN: ; %bb.0: +; GCN-NEXT: v_dual_add_f32 v0, v0, v2 :: v_dual_add_f32 v1, v1, v3 +; GCN-NEXT: ; return to shader part epilog %fadd = fadd <2 x float> %a, %b ret <2 x float> %fadd } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GFX11-GISEL: {{.*}} -; GFX11-SDAG: {{.*}} -; GFX12-GISEL: {{.*}} -; GFX12-SDAG: {{.*}} From 8298dc3f027fdc8e630e8474f25832d450892c2b Mon Sep 17 00:00:00 2001 From: Abhinav Garg Date: Fri, 31 Oct 2025 06:26:17 +0000 Subject: [PATCH 3/3] Addressing more review comments --- .../AMDGPU/AMDGPURegBankLegalizeHelper.cpp | 12 +++--- llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll | 39 ++++++++----------- 2 files changed, 22 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp index 30df80fb62a8b..4eb7e78c8e43e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankLegalizeHelper.cpp @@ -619,16 +619,16 @@ void RegBankLegalizeHelper::lowerSplitTo32(MachineInstr &MI) { void RegBankLegalizeHelper::lowerSplitTo16(MachineInstr &MI) { Register Dst = MI.getOperand(0).getReg(); assert(MRI.getType(Dst) == V2S16); - auto [Op0Lo32, Op0Hi32] = unpackAExt(MI.getOperand(1).getReg()); - auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(2).getReg()); + auto [Op1Lo32, Op1Hi32] = unpackAExt(MI.getOperand(1).getReg()); + auto [Op2Lo32, Op2Hi32] = unpackAExt(MI.getOperand(2).getReg()); unsigned Opc = MI.getOpcode(); auto Flags = MI.getFlags(); - auto Op0Lo = B.buildTrunc(SgprRB_S16, Op0Lo32); - auto Op0Hi = B.buildTrunc(SgprRB_S16, Op0Hi32); auto Op1Lo = B.buildTrunc(SgprRB_S16, Op1Lo32); auto Op1Hi = B.buildTrunc(SgprRB_S16, 
Op1Hi32); - auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op0Lo, Op1Lo}, Flags); - auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op0Hi, Op1Hi}, Flags); + auto Op2Lo = B.buildTrunc(SgprRB_S16, Op2Lo32); + auto Op2Hi = B.buildTrunc(SgprRB_S16, Op2Hi32); + auto Lo = B.buildInstr(Opc, {SgprRB_S16}, {Op1Lo, Op2Lo}, Flags); + auto Hi = B.buildInstr(Opc, {SgprRB_S16}, {Op1Hi, Op2Hi}, Flags); B.buildMergeLikeInstr(Dst, {Lo, Hi}); MI.eraseFromParent(); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll index 602ec570cce37..e440beed1da79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fadd.ll @@ -74,45 +74,38 @@ define amdgpu_ps float @fadd_s32_div(float %a, float %b) { ret float %fadd } -define amdgpu_ps double @fadd_s64_uniform(double inreg %a, double inreg %b) { +define amdgpu_ps void @fadd_s64_uniform(double inreg %a, double inreg %b, ptr addrspace(1) %ptr) { ; GFX11-LABEL: fadd_s64_uniform: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_add_f64 v[0:1], s[0:1], s[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-NEXT: v_add_f64 v[2:3], s[0:1], s[2:3] +; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fadd_s64_uniform: ; GFX12: ; %bb.0: -; GFX12-NEXT: v_add_f64_e64 v[0:1], s[0:1], s[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: v_readfirstlane_b32 s1, v1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: ; return to shader part epilog +; GFX12-NEXT: v_add_f64_e64 v[2:3], s[0:1], s[2:3] +; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off +; GFX12-NEXT: s_endpgm %fadd = fadd double %a, %b - ret double %fadd + store double %fadd, ptr addrspace(1) %ptr + ret void } -define 
amdgpu_ps double @fadd_s64_div(double %a, double %b) { +define amdgpu_ps void @fadd_s64_div(double %a, double %b, ptr addrspace(1) %ptr) { ; GFX11-LABEL: fadd_s64_div: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], v[2:3] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-NEXT: v_readfirstlane_b32 s1, v1 -; GFX11-NEXT: ; return to shader part epilog +; GFX11-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: fadd_s64_div: ; GFX12: ; %bb.0: ; GFX12-NEXT: v_add_f64_e32 v[0:1], v[0:1], v[2:3] -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-NEXT: v_readfirstlane_b32 s0, v0 -; GFX12-NEXT: v_readfirstlane_b32 s1, v1 -; GFX12-NEXT: ; return to shader part epilog +; GFX12-NEXT: global_store_b64 v[4:5], v[0:1], off +; GFX12-NEXT: s_endpgm %fadd = fadd double %a, %b - ret double %fadd + store double %fadd, ptr addrspace(1) %ptr + ret void } define amdgpu_ps <2 x half> @fadd_v2s16_uniform(<2 x half> inreg %a, <2 x half> inreg %b) {