-
Notifications
You must be signed in to change notification settings - Fork 15k
[AMDGPU] Add support for v_log_bf16 on gfx1250 #149201
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Conversation
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This stack of pull requests is managed by Graphite. Learn more about stacking. |
@llvm/pr-subscribers-mc @llvm/pr-subscribers-backend-amdgpu Author: Shilei Tian (shiltian). Changes: Co-authored-by: Mekhanoshin, Stanislav <[email protected]>. Patch is 79.65 KiB, truncated to 20.00 KiB below; full version: https://github.com/llvm/llvm-project/pull/149201.diff. 25 Files Affected:
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index a80f571140666..eee0a94f6fc64 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -672,6 +672,7 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_tensorcnt, "vIUs", "n", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_tanh_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rcp_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_rsq_bf16, "yy", "nc", "bf16-trans-insts")
+TARGET_BUILTIN(__builtin_amdgcn_log_bf16, "yy", "nc", "bf16-trans-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_fp8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 8d227a5f957c8..0312205d4ff8d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -436,6 +436,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_dispatch_ptr:
return EmitAMDGPUDispatchPtr(*this, E);
case AMDGPU::BI__builtin_amdgcn_logf:
+ case AMDGPU::BI__builtin_amdgcn_log_bf16:
return emitBuiltinWithOneOverloadedType<1>(*this, E, Intrinsic::amdgcn_log);
case AMDGPU::BI__builtin_amdgcn_exp2f:
return emitBuiltinWithOneOverloadedType<1>(*this, E,
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
index 8b7ec143a2e00..bdf169a1a97da 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl
@@ -99,6 +99,25 @@ void test_rsq_bf16(global __bf16* out, __bf16 a)
*out = __builtin_amdgcn_rsq_bf16(a);
}
+// CHECK-LABEL: @test_log_bf16(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[A_ADDR:%.*]] = alloca bfloat, align 2, addrspace(5)
+// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP0:%.*]] = load bfloat, ptr [[A_ADDR_ASCAST]], align 2
+// CHECK-NEXT: [[TMP1:%.*]] = call bfloat @llvm.amdgcn.log.bf16(bfloat [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-NEXT: store bfloat [[TMP1]], ptr addrspace(1) [[TMP2]], align 2
+// CHECK-NEXT: ret void
+//
+void test_log_bf16(global __bf16* out, __bf16 a)
+{
+ *out = __builtin_amdgcn_log_bf16(a);
+}
+
// CHECK-LABEL: @test_cvt_f16_fp8(
// CHECK-NEXT: entry:
// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 6f8437e82700e..e1bc39302e126 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -532,6 +532,7 @@ defm V_TANH_BF16 : VOP1Inst_t16 <"v_tanh_bf16", VOP_BF16_BF16, int_amdgcn_tanh>;
defm V_RCP_BF16 : VOP1Inst_t16 <"v_rcp_bf16", VOP_BF16_BF16, AMDGPUrcp>;
defm V_SQRT_BF16 : VOP1Inst_t16 <"v_sqrt_bf16", VOP_BF16_BF16, any_amdgcn_sqrt>;
defm V_RSQ_BF16 : VOP1Inst_t16 <"v_rsq_bf16", VOP_BF16_BF16, AMDGPUrsq>;
+defm V_LOG_BF16 : VOP1Inst_t16 <"v_log_bf16", VOP_BF16_BF16, AMDGPUlogf16>;
}
} // End TRANS = 1, SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst_t16 <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
@@ -1143,6 +1144,7 @@ defm V_CVT_F16_BF8 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x078>;
defm V_RCP_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x079>;
defm V_SQRT_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07a>;
defm V_RSQ_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07b>;
+defm V_LOG_BF16 : VOP1_Real_FULL_t16_and_fake16_gfx1250<0x07c>;
//===----------------------------------------------------------------------===//
// GFX10.
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-math.ll b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
new file mode 100644
index 0000000000000..05eee2d4d549d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bf16-math.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck --check-prefix=GCN %s
+
+; TODO: Add global-isel when it can support bf16
+
+define amdgpu_ps void @llvm_log2_bf16_v(ptr addrspace(1) %out, bfloat %src) {
+; GCN-LABEL: llvm_log2_bf16_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_log_bf16_e32 v2, v2
+; GCN-NEXT: global_store_b16 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %log = call bfloat @llvm.log2.bf16(bfloat %src)
+ store bfloat %log, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+define amdgpu_ps void @llvm_log2_bf16_s(ptr addrspace(1) %out, bfloat inreg %src) {
+; GCN-LABEL: llvm_log2_bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_log_bf16_e32 v2, s0
+; GCN-NEXT: global_store_b16 v[0:1], v2, off
+; GCN-NEXT: s_endpgm
+ %log = call bfloat @llvm.log2.bf16(bfloat %src)
+ store bfloat %log, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+declare bfloat @llvm.log2.bf16(bfloat)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll
new file mode 100644
index 0000000000000..a8b2077f5a35b
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.log.bf16.ll
@@ -0,0 +1,33 @@
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefixes=GCN %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1250 < %s | FileCheck -check-prefix=GCN %s
+
+; FIXME: GlobalISel does not work with bf16
+
+declare bfloat @llvm.amdgcn.log.bf16(bfloat) #0
+
+; GCN-LABEL: {{^}}log_bf16:
+; GCN: v_log_bf16_e32 {{v[0-9]+}}, {{s[0-9]+}}
+define amdgpu_kernel void @log_bf16(ptr addrspace(1) %out, bfloat %src) #1 {
+ %log = call bfloat @llvm.amdgcn.log.bf16(bfloat %src) #0
+ store bfloat %log, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}log_bf16_constant_4
+; GCN: v_log_bf16_e32 v0, 4.0
+define amdgpu_kernel void @log_bf16_constant_4(ptr addrspace(1) %out) #1 {
+ %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 4.0) #0
+ store bfloat %log, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+; GCN-LABEL: {{^}}log_bf16_constant_100
+; GCN: v_log_bf16_e32 {{v[0-9]+}}, 0x42c8
+define amdgpu_kernel void @log_bf16_constant_100(ptr addrspace(1) %out) #1 {
+ %log = call bfloat @llvm.amdgcn.log.bf16(bfloat 100.0) #0
+ store bfloat %log, ptr addrspace(1) %out, align 2
+ ret void
+}
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll
new file mode 100644
index 0000000000000..5bd9fa6f23aa0
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.log2.bf16.ll
@@ -0,0 +1,240 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-TRUE16 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-SDAG-FAKE16 %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=+real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-TRUE16 %s
+; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1250 -mattr=-real-true16 %s -o - | FileCheck -check-prefix=GFX-GISEL-FAKE16 %s
+
+define bfloat @v_log2_bf16(bfloat %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %result = call bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %result
+}
+
+define bfloat @v_log2_fabs_bf16(bfloat %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, |v0.l|
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, |v0|
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %in)
+ %result = call bfloat @llvm.log2.bf16(bfloat %fabs)
+ ret bfloat %result
+}
+
+define bfloat @v_log2_fneg_fabs_bf16(bfloat %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -|v0.l|
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -|v0|
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call bfloat @llvm.fabs.bf16(bfloat %in)
+ %fneg.fabs = fneg bfloat %fabs
+ %result = call bfloat @llvm.log2.bf16(bfloat %fneg.fabs)
+ ret bfloat %result
+}
+
+define bfloat @v_log2_fneg_bf16(bfloat %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fneg = fneg bfloat %in
+ %result = call bfloat @llvm.log2.bf16(bfloat %fneg)
+ ret bfloat %result
+}
+
+define bfloat @v_log2_bf16_fast(bfloat %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_bf16_fast:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_bf16_fast:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %result = call fast bfloat @llvm.log2.bf16(bfloat %in)
+ ret bfloat %result
+}
+
+define <2 x bfloat> @v_log2_v2bf16(<2 x bfloat> %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
+; GFX-SDAG-FAKE16-NEXT: v_nop
+; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_log2_fabs_v2bf16(<2 x bfloat> %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fabs_v2bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
+; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v1.l
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v2.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fabs_v2bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
+; GFX-SDAG-FAKE16-NEXT: v_nop
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
+ %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fabs)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_log2_fneg_fabs_v2bf16(<2 x bfloat> %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_fabs_v2bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX-SDAG-TRUE16-NEXT: v_bfe_u32 v2, v0, 16, 15
+; GFX-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v1.l
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v2.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_fabs_v2bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff7fff, v0
+; GFX-SDAG-FAKE16-NEXT: v_bfe_u32 v0, v0, 16, 15
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
+; GFX-SDAG-FAKE16-NEXT: v_nop
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(TRANS32_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fabs = call <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat> %in)
+ %fneg.fabs = fneg <2 x bfloat> %fabs
+ %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg.fabs)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_log2_fneg_v2bf16(<2 x bfloat> %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_fneg_v2bf16:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.h, -v0.h
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e64 v0.l, -v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_fneg_v2bf16:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v0, -v0
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e64 v1, -v1
+; GFX-SDAG-FAKE16-NEXT: v_nop
+; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %fneg = fneg <2 x bfloat> %in
+ %result = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %fneg)
+ ret <2 x bfloat> %result
+}
+
+define <2 x bfloat> @v_log2_v2bf16_fast(<2 x bfloat> %in) {
+; GFX-SDAG-TRUE16-LABEL: v_log2_v2bf16_fast:
+; GFX-SDAG-TRUE16: ; %bb.0:
+; GFX-SDAG-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.h, v0.h
+; GFX-SDAG-TRUE16-NEXT: v_log_bf16_e32 v0.l, v0.l
+; GFX-SDAG-TRUE16-NEXT: s_set_pc_i64 s[30:31]
+;
+; GFX-SDAG-FAKE16-LABEL: v_log2_v2bf16_fast:
+; GFX-SDAG-FAKE16: ; %bb.0:
+; GFX-SDAG-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0
+; GFX-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v0, v0
+; GFX-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(TRANS32_DEP_1)
+; GFX-SDAG-FAKE16-NEXT: v_log_bf16_e32 v1, v1
+; GFX-SDAG-FAKE16-NEXT: v_nop
+; GFX-SDAG-FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x5040100
+; GFX-SDAG-FAKE16-NEXT: s_set_pc_i64 s[30:31]
+ %result = call fast <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> %in)
+ ret <2 x bfloat> %result
+}
+
+declare bfloat @llvm.log2.bf16(bfloat) #0
+declare <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat>) #0
+declare bfloat @llvm.fabs.bf16(bfloat) #0
+declare <2 x bfloat> @llvm.fabs.v2bf16(<2 x bfloat>) #0
+
+attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
diff --git a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
index 467418874592a..0f5ce56f1a2cf 100644
--- a/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
+++ b/llvm/test/MC/AMDGPU/gfx1250_asm_vop1-fake16.s
@@ -208,6 +208,51 @@ v_rsq_bf16 v5, src_scc
v_rsq_bf16 v127, 0x8000
// GFX1250: v_rsq_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf6,0xfe,0x7e,0x00,0x80,0x00,0x00]
+v_log_bf16 v5, v1
+// GFX1250: v_log_bf16_e32 v5, v1 ; encoding: [0x01,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, v127
+// GFX1250: v_log_bf16_e32 v5, v127 ; encoding: [0x7f,0xf9,0x0a,0x7e]
+
+v_log_bf16 v5, s1
+// GFX1250: v_log_bf16_e32 v5, s1 ; encoding: [0x01,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, s105
+// GFX1250: v_log_bf16_e32 v5, s105 ; encoding: [0x69,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_lo
+// GFX1250: v_log_bf16_e32 v5, vcc_lo ; encoding: [0x6a,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, vcc_hi
+// GFX1250: v_log_bf16_e32 v5, vcc_hi ; encoding: [0x6b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, ttmp15
+// GFX1250: v_log_bf16_e32 v5, ttmp15 ; encoding: [0x7b,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, m0
+// GFX1250: v_log_bf16_e32 v5, m0 ; encoding: [0x7d,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_lo
+// GFX1250: v_log_bf16_e32 v5, exec_lo ; encoding: [0x7e,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, exec_hi
+// GFX1250: v_log_bf16_e32 v5, exec_hi ; encoding: [0x7f,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, null
+// GFX1250: v_log_bf16_e32 v5, null ; encoding: [0x7c,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, -1
+// GFX1250: v_log_bf16_e32 v5, -1 ; encoding: [0xc1,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, 0.5
+// GFX1250: v_log_bf16_e32 v5, 0.5 ; encoding: [0xf0,0xf8,0x0a,0x7e]
+
+v_log_bf16 v5, src_scc
+// GFX1250: v_log_bf16_e32 v5, src_scc ; encoding: [0xfd,0xf8,0x0a,0x7e]
+
+v_log_bf16 v127, 0x8000
+// GFX1250: v_log_bf16_e32 v127, 0x8000 ; encoding: [0xff,0xf8,0xfe,0x7e,0x00,0x80,0x00,0x00]
+
v_cvt_f32_bf16 v5, v1
// GFX1250: v_cvt_f32_bf16_e32 v5, v1 ; encoding: [0x01,0xe5,0x0a,0x7e]
diff --git a/llvm/test/MC/AMDGPU/gfx1250...
[truncated]
|
rampitec
approved these changes
Jul 16, 2025
2960778
to
2cc4091
Compare
Co-authored-by: Mekhanoshin, Stanislav <[email protected]>
0604616
to
00b2b5c
Compare
This was referenced Jul 23, 2025
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Labels
backend:AMDGPU
clang:codegen
IR generation bugs: mangling, exceptions, etc.
clang:frontend
Language frontend issues, e.g. anything involving "Sema"
clang
Clang issues not falling into any other category
llvm:mc
Machine (object) code
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Co-authored-by: Mekhanoshin, Stanislav [email protected]