Port reverse from CUDA.jl #609

Draft: wants to merge 10 commits into base: master
4 changes: 4 additions & 0 deletions Project.toml
@@ -3,8 +3,10 @@ uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "11.2.3"

[deps]
AcceleratedKernels = "6a4ca0a5-0e36-4168-a932-d9be78d558f1"
Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
GPUToolbox = "096a3bc2-3ced-46d0-87f4-dd12716f4bfc"
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
@@ -22,8 +24,10 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
JLD2Ext = "JLD2"

[compat]
AcceleratedKernels = "0.4"
Adapt = "4.0"
GPUArraysCore = "= 0.2.0"
GPUToolbox = "0.2, 0.3"
JLD2 = "0.4, 0.5"
KernelAbstractions = "0.9.28"
LLVM = "3.9, 4, 5, 6, 7, 8, 9"
3 changes: 3 additions & 0 deletions src/GPUArrays.jl
@@ -1,5 +1,6 @@
module GPUArrays

using GPUToolbox
using KernelAbstractions
using Serialization
using Random
@@ -16,6 +17,7 @@ using Reexport
@reexport using GPUArraysCore

using KernelAbstractions
import AcceleratedKernels as AK

# device functionality
include("device/abstractarray.jl")
@@ -26,6 +28,7 @@ include("host/construction.jl")
## integrations and specialized methods
include("host/base.jl")
include("host/indexing.jl")
include("host/reverse.jl")
include("host/broadcast.jl")
include("host/mapreduce.jl")
include("host/linalg.jl")
148 changes: 148 additions & 0 deletions src/host/reverse.jl
@@ -0,0 +1,148 @@
# reversing

# The kernel treats the array as 1D. After reversing along dimension x, the element at
# pos [i1, i2, i3, ..., i{x}, ..., i{n}] ends up at
# pos [i1, i2, i3, ..., d{x} - i{x} + 1, ..., i{n}], where d{x} is the size of dimension x.
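# Worked example (illustrative): in a 3×4 matrix reversed along dims = 2 (so d{2} = 4),
# the element at [2, 3] is written to [2, 4 - 3 + 1] = [2, 2]; the index along the
# untouched first dimension stays the same.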

# out-of-place version, copying a single value per thread from input to output
function _reverse(input::AnyGPUArray{T, N}, output::AnyGPUArray{T, N};
dims=1:ndims(input)) where {T, N}
@assert size(input) == size(output)
rev_dims = ntuple((d)-> d in dims && size(input, d) > 1, N)
ref = size(input) .+ 1
# converts an ND-index in the data array to the linear index
lin_idx = LinearIndices(input)
# converts a linear index in the data array to an ND-index
nd_idx = CartesianIndices(input)

## COV_EXCL_START
@kernel unsafe_indices=true function kernel(input, output)
offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
index_in = offset_in + @index(Local, Linear)

@inbounds if index_in <= length(input)
idx = Tuple(nd_idx[index_in])
idx = ifelse.(rev_dims, ref .- idx, idx)
index_out = lin_idx[idx...]
output[index_out] = input[index_in]
end
end
## COV_EXCL_STOP

nthreads = 256

kernel(get_backend(input))(input, output; workgroupsize=nthreads, ndrange=length(input))
end

# in-place version, swapping elements on half the number of threads
function _reverse!(data::AnyGPUArray{T, N}; dims=1:ndims(data)) where {T, N}
rev_dims = ntuple((d)-> d in dims && size(data, d) > 1, N)
half_dim = findlast(rev_dims)
if isnothing(half_dim)
# no reverse operation needed at all in this case.
return
end
ref = size(data) .+ 1
# converts an ND-index in the data array to the linear index
lin_idx = LinearIndices(data)
reduced_size = ntuple((d)->ifelse(d==half_dim, cld(size(data,d),2), size(data,d)), N)
reduced_length = prod(reduced_size)
# converts a linear index in a reduced array to an ND-index, but using the reduced size
nd_idx = CartesianIndices(reduced_size)

## COV_EXCL_START
@kernel unsafe_indices=true function kernel(data)
offset_in = Int32(@groupsize()[1]) * (@index(Group, Linear) - 1i32)
index_in = offset_in + @index(Local, Linear)

@inbounds if index_in <= reduced_length
idx = Tuple(nd_idx[index_in])
index_in = lin_idx[idx...]
idx = ifelse.(rev_dims, ref .- idx, idx)
index_out = lin_idx[idx...]

if index_in < index_out
temp = data[index_out]
data[index_out] = data[index_in]
data[index_in] = temp
end
end
end
## COV_EXCL_STOP

# NOTE: we launch slightly more than half the number of elements in the array as threads.
# The last non-singleton dimension along which to reverse is used to define how the array is split.
# Only the middle row in case of an odd array dimension could cause trouble, but this is prevented by
# ignoring the threads that cross the mid-point
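# Worked example (illustrative): reversing a length-5 vector in place gives
# reduced_size = (cld(5, 2),) = (3,), so 3 threads do useful work: threads 1 and 2 swap
# elements (1, 5) and (2, 4), while thread 3 maps the middle element onto itself and is
# skipped by the `index_in < index_out` check.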

nthreads = 256

kernel(get_backend(data))(data; workgroupsize=nthreads, ndrange=reduced_length)
end


# n-dimensional API

function Base.reverse!(data::AnyGPUArray{T, N}; dims=:) where {T, N}
if isa(dims, Colon)
dims = 1:ndims(data)
end
if !applicable(iterate, dims)
throw(ArgumentError("dimension $dims is not an iterable"))
end
if !all(1 .≤ dims .≤ ndims(data))
throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(data))"))
end

_reverse!(data; dims=dims)

return data
end

# out-of-place
function Base.reverse(input::AnyGPUArray{T, N}; dims=:) where {T, N}
if isa(dims, Colon)
dims = 1:ndims(input)
end
if !applicable(iterate, dims)
throw(ArgumentError("dimension $dims is not an iterable"))
end
if !all(1 .≤ dims .≤ ndims(input))
throw(ArgumentError("dimension $dims is not 1 ≤ $dims ≤ $(ndims(input))"))
end

if all(size(input)[[dims...]].==1)
# no reverse operation needed at all in this case.
return copy(input)
else
output = similar(input)
_reverse(input, output; dims=dims)
return output
end
end


# 1-dimensional API

# in-place
Base.@propagate_inbounds function Base.reverse!(data::AnyGPUVector{T}, start::Integer,
stop::Integer=length(data)) where {T}
_reverse!(view(data, start:stop))
return data
end

Base.reverse!(data::AnyGPUVector{T}) where {T} = @inbounds reverse!(data, 1, length(data))

# out-of-place
Base.@propagate_inbounds function Base.reverse(input::AnyGPUVector{T}, start::Integer,
stop::Integer=length(input)) where {T}
output = similar(input)

start > 1 && copyto!(output, 1, input, 1, start-1)
_reverse(view(input, start:stop), view(output, start:stop))
stop < length(input) && copyto!(output, stop+1, input, stop+1, length(input)-stop)

return output
end

Base.reverse(data::AnyGPUVector{T}) where {T} = @inbounds reverse(data, 1, length(data))
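
A minimal usage sketch of the API added above, assuming the JLArrays.jl reference backend
that ships with GPUArrays (the array type and the values are illustrative; any
AbstractGPUArray backend is expected to behave the same way):

using JLArrays

A = JLArray(reshape(Float32.(1:12), 3, 4))

B = reverse(A; dims=2)        # out-of-place, reverse along the second dimension
C = reverse(A; dims=(1, 2))   # reverse along several dimensions at once
reverse!(A; dims=1)           # in-place variant, mutates A

v = JLArray(Float32.(1:10))
w = reverse(v, 3, 8)          # 1-d API: reverse only elements 3 through 8
reverse!(v)                   # 1-d in-place reverse of the whole vector
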
45 changes: 45 additions & 0 deletions test/testsuite/base.jl
@@ -381,6 +381,51 @@ end
gA = reshape(AT(A),4)
end

@testset "reverse" begin
# 1-d out-of-place
@test compare(x->reverse(x), AT, rand(Float32, 1000))
@test compare(x->reverse(x, 10), AT, rand(Float32, 1000))
@test compare(x->reverse(x, 10, 90), AT, rand(Float32, 1000))

# 1-d in-place
@test compare(x->reverse!(x), AT, rand(Float32, 1000))
@test compare(x->reverse!(x, 10), AT, rand(Float32, 1000))
@test compare(x->reverse!(x, 10, 90), AT, rand(Float32, 1000))

# n-d out-of-place
for shape in ([1, 2, 4, 3], [4, 2], [5], [2^5, 2^5, 2^5]),
dim in 1:length(shape)
@test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))

cpu = rand(Float32, shape...)
gpu = AT(cpu)
reverse!(gpu; dims=dim)
@test Array(gpu) == reverse(cpu; dims=dim)
end

# supports multidimensional reverse
for shape in ([1, 2, 4, 3], [2^5, 2^5, 2^5]),
dim in ((1,2),(2,3),(1,3),:)
@test compare(x->reverse(x; dims=dim), AT, rand(Float32, shape...))

cpu = rand(Float32, shape...)
gpu = AT(cpu)
reverse!(gpu; dims=dim)
@test Array(gpu) == reverse(cpu; dims=dim)
end

# wrapped array
@test compare(x->reverse(x), AT, reshape(rand(Float32, 2,2), 4))

# error throwing
cpu = rand(Float32, 1,2,3,4)
gpu = AT(cpu)
@test_throws ArgumentError reverse!(gpu, dims=5)
@test_throws ArgumentError reverse!(gpu, dims=0)
@test_throws ArgumentError reverse(gpu, dims=5)
@test_throws ArgumentError reverse(gpu, dims=0)
end

@testset "reinterpret" begin
A = Int32[-1,-2,-3]
dA = AT(A)