diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst
index 5ad8f9ab07e40..3b29d28552c43 100644
--- a/llvm/docs/NVPTXUsage.rst
+++ b/llvm/docs/NVPTXUsage.rst
@@ -796,6 +796,112 @@ every time. For more information, refer PTX ISA
 Membar/Fences
 -------------
 
+'``llvm.nvvm.fence.acquire/release.sync_restrict.*``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster()
+  declare void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster()
+
+Overview:
+"""""""""
+
+The `nvvm.fence.{semantics}.sync_restrict.*` intrinsics restrict the class of
+memory operations for which the fence instruction provides the memory ordering
+guarantees. When `.sync_restrict` is restricted to `shared_cta`, the memory
+semantics must be `release` and the effect of the fence operation applies only
+to operations performed on objects in the `shared_cta` memory space. Likewise,
+when `.sync_restrict` is restricted to `shared_cluster`, the memory semantics
+must be `acquire` and the effect of the fence operation applies only to
+operations performed on objects in the `shared_cluster` memory space. The
+scope of both operations is `cluster`. For more details, please refer to the
+`PTX ISA `__.
+
+'``llvm.nvvm.fence.mbarrier_init.release.cluster``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.mbarrier_init.release.cluster()
+
+Overview:
+"""""""""
+
+The `nvvm.fence.mbarrier_init.release.cluster` intrinsic restricts the class of
+memory operations for which the fence instruction provides the memory ordering
+guarantees. The `mbarrier_init` modifier restricts the synchronizing effect to
+prior `mbarrier_init` operations executed by the same thread on mbarrier
+objects in the `shared_cta` memory space. For more details, please refer to the
+`PTX ISA `__.
+
+'``llvm.nvvm.fence.proxy.async_generic.acquire/release.sync_restrict``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+  declare void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+
+Overview:
+"""""""""
+
+The `nvvm.fence.proxy.async_generic.{semantics}.sync_restrict` intrinsics are
+used to establish ordering between a prior memory access performed via the
+`async proxy`__ and a subsequent memory access performed via the generic proxy.
+``nvvm.fence.proxy.async_generic.release.sync_restrict`` can form a release
+sequence that synchronizes with an acquire sequence that contains the
+``nvvm.fence.proxy.async_generic.acquire.sync_restrict`` proxy fence. When
+`.sync_restrict` is restricted to `shared_cta`, the memory semantics must be
+`release` and the effect of the fence operation applies only to operations
+performed on objects in the `shared_cta` memory space. Likewise, when
+`.sync_restrict` is restricted to `shared_cluster`, the memory semantics must
+be `acquire` and the effect of the fence operation applies only to operations
+performed on objects in the `shared_cluster` memory space. The scope of both
+operations is `cluster`. For more details, please refer to the
+`PTX ISA `__.
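+
+The following is a minimal illustrative sketch of how the two fences are
+intended to pair up across a cluster. The function names, the elided
+synchronization step, and the address-space usage are assumptions made for
+the example, not part of the intrinsic definitions:
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+  declare void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+
+  define void @producer(ptr addrspace(3) %buf) {
+    ; ... async-proxy writes to %buf (shared::cta) ...
+    call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+    ; ... signal the consumer thread in the cluster, e.g. via an mbarrier ...
+    ret void
+  }
+
+  define void @consumer(ptr addrspace(7) %mapped) {
+    ; ... wait for the producer's signal ...
+    call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+    ; Generic-proxy reads of the mapped peer-CTA buffer (shared::cluster)
+    ; now observe the async-proxy writes made before the release fence.
+    %v = load i32, ptr addrspace(7) %mapped
+    ret void
+  }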
+
+'``llvm.nvvm.fence.proxy.{proxykind}``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.proxy.alias()
+  declare void @llvm.nvvm.fence.proxy.async()
+  declare void @llvm.nvvm.fence.proxy.async.global()
+  declare void @llvm.nvvm.fence.proxy.async.shared_cluster()
+  declare void @llvm.nvvm.fence.proxy.async.shared_cta()
+
+Overview:
+"""""""""
+
+The `nvvm.fence.proxy.{proxykind}` intrinsics represent a fence that establishes
+bi-directional proxy ordering between memory accesses performed via the
+`generic proxy`__ and memory accesses performed via the proxy specified by
+`proxykind`. A `bi-directional` proxy ordering between two proxykinds
+establishes two `uni-directional` proxy orderings: one from the first proxykind
+to the second, and the other from the second proxykind to the first.
+
+The `alias` proxykind refers to memory accesses performed using virtually
+aliased addresses to the same memory location.
+
+The `async` proxykind specifies that the memory ordering is established between
+the `async proxy` and the `generic proxy`. The memory ordering is limited to
+operations performed on objects in the specified state space (`generic`,
+`global`, `shared_cluster`, `shared_cta`). If no state space is specified,
+the memory ordering applies to all state spaces. For more details, please
+refer to the `PTX ISA `__.
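+
+The following is a minimal illustrative sketch of the `async.shared_cta`
+variant. The function name and the elided async-proxy consumer are assumptions
+made for the example, not part of the intrinsic definitions:
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.fence.proxy.async.shared_cta()
+
+  define void @stage_smem(ptr addrspace(3) %buf, i32 %v) {
+    ; Write the staging buffer through the generic proxy.
+    store i32 %v, ptr addrspace(3) %buf
+    ; Order the generic-proxy store with async-proxy accesses to
+    ; shared::cta memory, e.g. a subsequent bulk asynchronous copy.
+    call void @llvm.nvvm.fence.proxy.async.shared_cta()
+    ; ... async-proxy operation consuming %buf would follow here ...
+    ret void
+  }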
+
 '``llvm.nvvm.fence.proxy.tensormap_generic.*``'
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index 2710853e17688..8404e3bcff859 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1746,33 +1746,65 @@ let TargetPrefix = "nvvm" in {
     def int_nvvm_barrier_cluster_wait_aligned : Intrinsic<[]>;
   }
 
-  //
-  // Membar
-  //
-  let IntrProperties = [IntrNoCallback] in {
+//
+// Membar / Fence
+//
+let IntrProperties = [IntrNoCallback] in {
   def int_nvvm_membar_cta : NVVMBuiltin, Intrinsic<[]>;
   def int_nvvm_membar_gl : NVVMBuiltin, Intrinsic<[]>;
   def int_nvvm_membar_sys : NVVMBuiltin, Intrinsic<[]>;
   def int_nvvm_fence_sc_cluster : Intrinsic<[]>;
-  }
 
-  //
-  // Proxy fence (uni-directional)
-  //
+  // Operation fence
+  def int_nvvm_fence_mbarrier_init_release_cluster: Intrinsic<[], [], [],
+      "llvm.nvvm.fence.mbarrier_init.release.cluster">;
+
+  // Thread fence
+  def int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster :
+    Intrinsic<[], [], [],
+      "llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster">;
+
+  def int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster :
+    Intrinsic<[], [], [],
+      "llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster">;
+
+//
+// Proxy fence (uni-directional)
+//
+
+  def int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster :
+    Intrinsic<[], [], [],
+      "llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster">;
+
+  def int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster :
+    Intrinsic<[], [], [],
+      "llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster">;
+
   foreach scope = ["cta", "cluster", "gpu", "sys"] in {
     def int_nvvm_fence_proxy_tensormap_generic_release_ # scope :
-      Intrinsic<[], [], [IntrNoCallback],
+      Intrinsic<[], [], [],
                 "llvm.nvvm.fence.proxy.tensormap_generic.release." # scope>;
 
     // The imm-arg 'size' can only be 128.
     def int_nvvm_fence_proxy_tensormap_generic_acquire_ # scope :
-      Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty],
-        [IntrNoCallback, IntrArgMemOnly, ImmArg<ArgIndex<1>>,
-         Range<ArgIndex<1>, 128, 129>],
-        "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope>;
+      Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [],
+        "llvm.nvvm.fence.proxy.tensormap_generic.acquire." # scope> {
+      let IntrProperties = [IntrNoCallback, IntrArgMemOnly,
+        ImmArg<ArgIndex<1>>, Range<ArgIndex<1>, 128, 129>];
+    }
  }
+//
+// Proxy fence (bi-directional)
+//
+  foreach proxykind = ["alias", "async", "async.global", "async.shared_cta",
+                       "async.shared_cluster"] in {
+    defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>;
+    def Intr.record_name: Intrinsic<[], [], [], Intr.intr_name>;
+  }
+}
+
 //
 // Async Copy
 //
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 50827bd548ad5..44abb25ebcfc2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -364,7 +364,42 @@ def INT_FENCE_SC_CLUSTER:
   NullaryInst<"fence.sc.cluster", int_nvvm_fence_sc_cluster>,
   Requires<[hasPTX<78>, hasSM<90>]>;
 
+def INT_FENCE_MBARRIER_INIT_RELEASE_CLUSTER:
+  NullaryInst<"fence.mbarrier_init.release.cluster",
+              int_nvvm_fence_mbarrier_init_release_cluster>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
+let Predicates = [hasPTX<86>, hasSM<90>] in {
+def INT_FENCE_ACQUIRE_SYNC_RESTRICT_CLUSTER_CLUSTER:
+  NullaryInst<"fence.acquire.sync_restrict::shared::cluster.cluster",
+              int_nvvm_fence_acquire_sync_restrict_space_cluster_scope_cluster>;
+
+def INT_FENCE_RELEASE_SYNC_RESTRICT_CTA_CLUSTER:
+  NullaryInst<"fence.release.sync_restrict::shared::cta.cluster",
+              int_nvvm_fence_release_sync_restrict_space_cta_scope_cluster>;
+}
+
 // Proxy fence (uni-directional)
+let Predicates = [hasPTX<86>, hasSM<90>] in {
+def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_ACQUIRE_SYNC_RESTRICT_SPACE_CLUSTER_SCOPE_CLUSTER:
+  NullaryInst<"fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster",
+              int_nvvm_fence_proxy_async_generic_acquire_sync_restrict_space_cluster_scope_cluster>;
+
+def INT_NVVM_FENCE_PROXY_ASYNC_GENERIC_RELEASE_SYNC_RESTRICT_SPACE_CTA_SCOPE_CLUSTER:
+  NullaryInst<"fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster",
+              int_nvvm_fence_proxy_async_generic_release_sync_restrict_space_cta_scope_cluster>;
+}
+
+// Proxy fence (bi-directional)
+foreach proxykind = ["alias", "async", "async.global", "async.shared_cta",
+                     "async.shared_cluster"] in {
+  defvar Preds = !if(!eq(proxykind, "alias"), [hasPTX<75>, hasSM<70>],
+                                              [hasPTX<80>, hasSM<90>]);
+  defvar Intr = IntrinsicName<"llvm.nvvm.fence.proxy." # proxykind>;
+  def : NullaryInst<"fence.proxy." # !subst("_", "::", proxykind),
+                    !cast<Intrinsic>(Intr.record_name)>, Requires<Preds>;
+}
+
 class FENCE_PROXY_TENSORMAP_GENERIC_RELEASE<string Scope, Intrinsic Intr> :
   NullaryInst<"fence.proxy.tensormap::generic.release." # Scope, Intr>,
   Requires<[hasPTX<83>, hasSM<90>]>;
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll
new file mode 100644
index 0000000000000..d46408e31752f
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90-ptx86.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %}
+
+define void @test_nvvm_fence_proxy_async_generic_acquire_sync_restrict() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_acquire_sync_restrict(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async::generic.acquire.sync_restrict::shared::cluster.cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async_generic.acquire.sync_restrict.space.cluster.scope.cluster()
+  ret void
+}
+
+define void @test_nvvm_fence_proxy_async_generic_release_sync_restrict() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_generic_release_sync_restrict(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async::generic.release.sync_restrict::shared::cta.cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async_generic.release.sync_restrict.space.cta.scope.cluster()
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll
new file mode 100644
index 0000000000000..896c624602a60
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-sm90.ll
@@ -0,0 +1,51 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+define void @test_nvvm_fence_proxy_async() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async()
+  ret void
+}
+
+define void @test_nvvm_fence_proxy_async_global() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_global(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.global;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async.global()
+  ret void
+}
+
+define void @test_nvvm_fence_proxy_async_shared_cluster() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cluster(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.shared::cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async.shared_cluster()
+  ret void
+}
+
+define void @test_nvvm_fence_proxy_async_shared_cta() {
+; CHECK-LABEL: test_nvvm_fence_proxy_async_shared_cta(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.async.shared::cta;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.async.shared_cta()
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll
new file mode 100644
index 0000000000000..ab35e4fb396d6
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy-tensormap-invalid.ll
@@ -0,0 +1,8 @@
+; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx83 -o /dev/null 2>&1 | FileCheck %s
+
+define void @test_fence_proxy_tensormap_generic_acquire(ptr addrspace(0) %addr) {
+  ; CHECK: immarg value 130 out of range [128, 129)
+  call void @llvm.nvvm.fence.proxy.tensormap_generic.acquire.cta(ptr addrspace(0) %addr, i32 130);
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/fence-proxy.ll b/llvm/test/CodeGen/NVPTX/fence-proxy.ll
new file mode 100644
index 0000000000000..cb5679e68944d
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/fence-proxy.ll
@@ -0,0 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_70 && ptxas-isa-7.5 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx75 | %ptxas-verify -arch=sm_70 %}
+
+define void @test_nvvm_fence_proxy_alias() {
+; CHECK-LABEL: test_nvvm_fence_proxy_alias(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.proxy.alias;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.proxy.alias()
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/op-fence.ll b/llvm/test/CodeGen/NVPTX/op-fence.ll
new file mode 100644
index 0000000000000..629b702742afb
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/op-fence.ll
@@ -0,0 +1,17 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx80 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_mbarrier_init
+define void @test_fence_mbarrier_init() {
+; CHECK-LABEL: test_fence_mbarrier_init(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.mbarrier_init.release.cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.mbarrier_init.release.cluster();
+
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/thread-fence.ll b/llvm/test/CodeGen/NVPTX/thread-fence.ll
new file mode 100644
index 0000000000000..185461bd183d0
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/thread-fence.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | FileCheck --check-prefixes=CHECK %s
+; RUN: %if ptxas-sm_90 && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx86 | %ptxas-verify -arch=sm_90 %}
+
+; CHECK-LABEL: test_fence_acquire
+define void @test_fence_acquire() {
+; CHECK-LABEL: test_fence_acquire(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.acquire.sync_restrict::shared::cluster.cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.acquire.sync_restrict.space.cluster.scope.cluster();
+
+  ret void
+}
+
+; CHECK-LABEL: test_fence_release
+define void @test_fence_release() {
+; CHECK-LABEL: test_fence_release(
+; CHECK: {
+; CHECK-EMPTY:
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: fence.release.sync_restrict::shared::cta.cluster;
+; CHECK-NEXT: ret;
+  call void @llvm.nvvm.fence.release.sync_restrict.space.cta.scope.cluster();
+
+  ret void
+}