Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 26 additions & 10 deletions src/KernelAbstractions.jl
Original file line number Diff line number Diff line change
Expand Up @@ -194,6 +194,10 @@ function unsafe_free! end

unsafe_free!(::AbstractArray) = return

include("intrinsics.jl")
import .KernelIntrinsics
export KernelIntrinsics

###
# Kernel language
# - @localmem
Expand Down Expand Up @@ -460,13 +464,27 @@ end
# Internal kernel functions
###

function __index_Local_Linear end
function __index_Group_Linear end
function __index_Global_Linear end
function __index_Local_Linear(ctx)
return KernelIntrinsics.get_local_id().x
end

function __index_Group_Linear(ctx)
return KernelIntrinsics.get_group_id().x
end

function __index_Global_Linear(ctx)
return KernelIntrinsics.get_global_id().x
end

function __index_Local_Cartesian end
function __index_Group_Cartesian end
function __index_Global_Cartesian end
function __index_Local_Cartesian(ctx)
return @inbounds workitems(__iterspace(ctx))[KernelIntrinsics.get_local_id().x]
end
function __index_Group_Cartesian(ctx)
return @inbounds blocks(__iterspace(ctx))[KernelIntrinsics.get_group_id().x]
end
function __index_Global_Cartesian(ctx)
return @inbounds expand(__iterspace(ctx), KernelIntrinsics.get_group_id().x, KernelIntrinsics.get_local_id().x)
end

@inline __index_Local_NTuple(ctx, I...) = Tuple(__index_Local_Cartesian(ctx, I...))
@inline __index_Group_NTuple(ctx, I...) = Tuple(__index_Group_Cartesian(ctx, I...))
Expand Down Expand Up @@ -796,11 +814,9 @@ include("macros.jl")
###

function Scratchpad end
function SharedMemory end
SharedMemory(t::Type{T}, dims::Val{Dims}, id::Val{Id}) where {T, Dims, Id} = KernelIntrinsics.localmemory(t, dims, id)

function __synchronize()
error("@synchronize used outside kernel or not captured")
end
__synchronize() = KernelIntrinsics.barrier()

@generated function __print(items...)
str = ""
Expand Down
60 changes: 60 additions & 0 deletions src/intrinsics.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
module KernelIntrinsics

"""
get_global_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of global work-items specified.
"""
function get_global_size end

"""
get_global_id()::@NamedTuple{x::Int, y::Int, z::Int}

Returns the unique global work-item ID.

!!! note
1-based.
"""
function get_global_id end

"""
get_local_size()::@NamedTuple{x::Int, y::Int, z::Int}

Return the number of local work-items specified.
"""
function get_local_size end

"""
get_local_id()::@NamedTuple{x::Int, y::Int, z::Int}

Returns the unique local work-item ID.

!!! note
1-based.
"""
function get_local_id end

"""
get_num_groups()::@NamedTuple{x::Int, y::Int, z::Int}

Returns the number of groups.
"""
function get_num_groups end

"""
get_group_id()::@NamedTuple{x::Int, y::Int, z::Int}

Returns the unique group ID.

!!! note
1-based.
"""
function get_group_id end

function localmemory end
function barrier()
error("Group barrier used outside kernel or not captured")
end
function print end
Comment on lines +54 to +58
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Needs docs.


end
29 changes: 15 additions & 14 deletions src/pocl/backend.jl
Original file line number Diff line number Diff line change
Expand Up @@ -140,29 +140,30 @@ end


## Indexing Functions
const KI = KA.KernelIntrinsics

@device_override @inline function KA.__index_Local_Linear(ctx)
return get_local_id(1)
@device_override @inline function KI.get_local_id()
return (; x = Int(get_local_id(1)), y = Int(get_local_id(2)), z = Int(get_local_id(3)))
end

@device_override @inline function KA.__index_Group_Linear(ctx)
return get_group_id(1)
@device_override @inline function KI.get_group_id()
return (; x = Int(get_group_id(1)), y = Int(get_group_id(2)), z = Int(get_group_id(3)))
end

@device_override @inline function KA.__index_Global_Linear(ctx)
return get_global_id(1)
@device_override @inline function KI.get_global_id()
return (; x = Int(get_global_id(1)), y = Int(get_global_id(2)), z = Int(get_global_id(3)))
end

@device_override @inline function KA.__index_Local_Cartesian(ctx)
@inbounds KA.workitems(KA.__iterspace(ctx))[get_local_id(1)]
@device_override @inline function KI.get_local_size()
return (; x = Int(get_local_size(1)), y = Int(get_local_size(2)), z = Int(get_local_size(3)))
end

@device_override @inline function KA.__index_Group_Cartesian(ctx)
@inbounds KA.blocks(KA.__iterspace(ctx))[get_group_id(1)]
@device_override @inline function KI.get_num_groups()
return (; x = Int(get_num_groups(1)), y = Int(get_num_groups(2)), z = Int(get_num_groups(3)))
end

@device_override @inline function KA.__index_Global_Cartesian(ctx)
return @inbounds KA.expand(KA.__iterspace(ctx), get_group_id(1), get_local_id(1))
@device_override @inline function KI.get_global_size()
return (; x = Int(get_global_size(1)), y = Int(get_global_size(2)), z = Int(get_global_size(3)))
end

@device_override @inline function KA.__validindex(ctx)
Expand All @@ -177,7 +178,7 @@ end

## Shared and Scratch Memory

@device_override @inline function KA.SharedMemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
@device_override @inline function KI.localmemory(::Type{T}, ::Val{Dims}, ::Val{Id}) where {T, Dims, Id}
ptr = POCL.emit_localmemory(T, Val(prod(Dims)))
CLDeviceArray(Dims, ptr)
end
Expand All @@ -189,7 +190,7 @@ end

## Synchronization and Printing

@device_override @inline function KA.__synchronize()
@device_override @inline function KI.barrier()
work_group_barrier(POCL.LOCAL_MEM_FENCE | POCL.GLOBAL_MEM_FENCE)
end

Expand Down
62 changes: 62 additions & 0 deletions test/intrinsics.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@

@kernel cpu = false inbounds = true unsafe_indices = true function test_intrinsics_kernel(results)
# Test all intrinsics return NamedTuples with x, y, z fields
global_size = KernelIntrinsics.get_global_size()
global_id = KernelIntrinsics.get_global_id()
local_size = KernelIntrinsics.get_local_size()
local_id = KernelIntrinsics.get_local_id()
num_groups = KernelIntrinsics.get_num_groups()
group_id = KernelIntrinsics.get_group_id()

if UInt32(global_id.x) <= UInt32(global_size.x)
results[1, global_id.x] = global_id.x
results[2, global_id.x] = local_id.x
results[3, global_id.x] = group_id.x
results[4, global_id.x] = global_size.x
results[5, global_id.x] = local_size.x
results[6, global_id.x] = num_groups.x
end
end


function intrinsics_testsuite(backend, AT)
@testset "KernelIntrinsics Tests" begin
@testset "Basic intrinsics functionality" begin

# Test with small kernel
N = 16
results = AT(zeros(Int, 6, N))

kernel = test_intrinsics_kernel(backend(), 4, (N,))
kernel(results, ndrange = N)
KernelAbstractions.synchronize(backend())

host_results = Array(results)

# Verify results make sense
for i in 1:N
global_id_x, local_id_x, group_id_x, global_size_x, local_size_x, num_groups_x = host_results[:, i]

# Global IDs should be 1-based and sequential
@test global_id_x == i

# Global size should match our ndrange
@test global_size_x == N

# Local size should be 4 (our workgroupsize)
@test local_size_x == 4

# Number of groups should be ceil(N/4) = 4
@test num_groups_x == 4

# Group ID should be 1-based
expected_group = div(i - 1, 4) + 1
@test group_id_x == expected_group

# Local ID should be 1-based within group
expected_local = ((i - 1) % 4) + 1
@test local_id_x == expected_local
end
end
end
end
5 changes: 5 additions & 0 deletions test/testsuite.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ end


include("test.jl")
include("intrinsics.jl")
include("localmem.jl")
include("private.jl")
include("unroll.jl")
Expand All @@ -47,6 +48,10 @@ function testsuite(backend, backend_str, backend_mod, AT, DAT; skip_tests = Set{
specialfunctions_testsuite(backend)
end

@conditional_testset "Intrinsics" skip_tests begin
intrinsics_testsuite(backend, AT)
end

@conditional_testset "Localmem" skip_tests begin
localmem_testsuite(backend, AT)
end
Expand Down
Loading