@synchronize inside while loops doesn't work on CPUs #262

@tom91136

Description

I'm trying to port the classic GPU tree reduction to KernelAbstractions.jl.
See this for the direct CUDA implementation I'm porting from.
This is what I have implemented currently:

const TBSize = 1024::Int
const DotBlocks = 256::Int

@kernel function dot(@Const(a), @Const(b), size, partial)
  local_i = @index(Local)
  group_i = @index(Group)
  tb_sum = @localmem T TBSize  # T is the element type, captured from the enclosing function (see the full reproducer below)
  @inbounds tb_sum[local_i] = 0.0

  # do the strided dot product first
  i = @index(Global)
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += TBSize * DotBlocks
  end

  # then the tree reduction within the workgroup
  offset = @private Int64 (1,)
  @inbounds begin
    offset[1] = @groupsize()[1] ÷ 2
    while offset[1] > 0
      @synchronize
      if (local_i - 1) < offset[1]
        tb_sum[local_i] += tb_sum[local_i+offset[1]]
      end
      offset[1] ÷= 2
    end
  end

  if (local_i == 1)
    @inbounds partial[group_i] = tb_sum[local_i]
  end
end

# driver
wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
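
For context, the launch above runs DotBlocks = 256 workgroups of TBSize = 1024 workitems (ndrange = 262144), and each workitem strides over the input by TBSize * DotBlocks. As a sanity check I think of it as the plain-Julia model below (model_dot is just my name for a checking helper, not part of the port):

# Plain-Julia model of the kernel: workitem l of group g starts at
# i = (g - 1) * TBSize + l, strides by TBSize * DotBlocks, and the tree
# reduction then collapses the per-workitem sums into partial[g].
function model_dot(a::Vector{T}, b::Vector{T}, size::Int) where {T}
  partial = zeros(T, DotBlocks)
  stride = TBSize * DotBlocks
  for g in 1:DotBlocks, l in 1:TBSize
    i = (g - 1) * TBSize + l
    acc = zero(T)
    while i <= size
      acc += a[i] * b[i]
      i += stride
    end
    partial[g] += acc
  end
  return partial
end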

I was able to get correct results, and performance seems mostly on par with our CUDA.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/CUDAStream.jl#L112) and AMDGPU.jl (https://github.com/UoB-HPC/BabelStream/blob/7c1e04a42b9b03b0e5c5d0b07c0ef9f4bdd59353/JuliaStream.jl/src/AMDGPUStream.jl#L135) implementations.
On the CPU, however, I got the following error:

Using device: AMD Ryzen 9 3900X 12-Core Processor (1T)
ERROR: LoadError: TaskFailedException
Stacktrace:
 [1] wait
   @ ./task.jl:322 [inlined]
 [2] wait
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:65 [inlined]
 [3] wait (repeats 2 times)
   @ ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:29 [inlined]
 [4] runDot(device::Tuple{UndefInitializer, String, Backend}, size::Int64, ainit::Float32, binit::Float32)
   @ Main ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:102
 [5] top-level scope
   @ ~/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

    nested task error: MethodError: no method matching isless(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
    Closest candidates are:
      isless(::AbstractVector{T} where T, ::AbstractVector{T} where T) at abstractarray.jl:1989
      isless(::Any, ::Missing) at missing.jl:88
      isless(::Missing, ::Any) at missing.jl:87
      ...
    Stacktrace:
      [1] call
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:456 [inlined]
      [2] fallback
        @ ~/.julia/packages/Cassette/N5kbV/src/context.jl:454 [inlined]
      [3] _overdub_fallback(::Any, ::Vararg{Any, N} where N)
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [4] overdub
        @ ~/.julia/packages/Cassette/N5kbV/src/overdub.jl:582 [inlined]
      [5] <(::Int64, ::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true})
        @ ./operators.jl:279 [inlined]
      [6] overdub
        @ ./operators.jl:279 [inlined]
      [7] >(::SubArray{Int64, 1, StaticArrays.MMatrix{1, 1024, Int64, 1024}, Tuple{Base.Slice{StaticArrays.SOneTo{1}}, Int64}, true}, ::Int64)
        @ ./operators.jl:305 [inlined]
      [8] overdub
        @ ./operators.jl:305 [inlined]
      [9] overdub
        @ ~/.julia/packages/KernelAbstractions/8W8KX/src/KernelAbstractions.jl:266 [inlined]
     [10] __thread_run(tid::Int64, len::Int64, rem::Int64, obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:157
     [11] __run(obj::KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, ndrange::Tuple{Int64}, iterspace::KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, args::Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, dynamic::KernelAbstractions.NDIteration.NoDynamicCheck)
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:130
     [12] (::KernelAbstractions.var"#33#34"{Nothing, Nothing, typeof(KernelAbstractions.__run), Tuple{KernelAbstractions.Kernel{CPU, KernelAbstractions.NDIteration.StaticSize{(1024,)}, KernelAbstractions.NDIteration.DynamicSize, var"#cpu_dot#5"{Float32}}, Tuple{Int64}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.StaticSize{(1024,)}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, Nothing}, Tuple{Vector{Float32}, Vector{Float32}, Int64, Vector{Float32}}, KernelAbstractions.NDIteration.NoDynamicCheck}})()
        @ KernelAbstractions ~/.julia/packages/KernelAbstractions/8W8KX/src/cpu.jl:22
in expression starting at /home/tom/babelstream-upstream/JuliaStream.jl/src/Test.jl:107

Removing the @synchronize macro from the while loop makes the error go away, but the answer then becomes incorrect.
I've tried @print eltype(offset[1]); it prints the correct generic type (Float32 in this case), so I'm not sure what @synchronize is doing here.
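
In case it helps with triage: the only restructuring I can think of is to give the reduction loop a trip count that depends only on the (static) group size, so that @synchronize no longer sits inside a data-dependent while loop. An untested sketch of that idea is below (dot_for is my name; T, TBSize and DotBlocks as in the reproducer, and I don't actually know whether the CPU backend accepts @synchronize inside a for loop either):

@kernel function dot_for(@Const(a), @Const(b), size, partial)
  local_i = @index(Local)
  group_i = @index(Group)
  tb_sum = @localmem T TBSize
  @inbounds tb_sum[local_i] = 0.0

  # strided dot product, unchanged from the version above
  i = @index(Global)
  while i <= size
    @inbounds tb_sum[local_i] += a[i] * b[i]
    i += TBSize * DotBlocks
  end

  # tree reduction with a fixed trip count: trailing_zeros(TBSize) == 10
  # halving steps for the power-of-two group size 1024, producing the same
  # offsets 512, 256, ..., 1 without any loop-carried @private state
  for pow in 1:trailing_zeros(TBSize)
    @synchronize
    offset = TBSize >> pow
    if (local_i - 1) < offset
      @inbounds tb_sum[local_i] += tb_sum[local_i+offset]
    end
  end

  if local_i == 1
    @inbounds partial[group_i] = tb_sum[local_i]
  end
end

The offsets and barrier placement match the while-loop version, so the numbers should come out the same; the only change is that the trip count is known from TBSize alone.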

For reference, here is what pkg status says:

  [21141c5a] AMDGPU v0.2.7
  [c7e460c6] ArgParse v1.1.4
  [052768ef] CUDA v3.2.1
  [72cfdca4] CUDAKernels v0.3.0
  [e2ba6199] ExprTools v0.1.3 ⚲
  [63c18a36] KernelAbstractions v0.7.0
  [d96e819e] Parameters v0.12.2
  [7eb9e9f0] ROCKernels v0.2.0
  [8ba89e20] Distributed

And the complete Test.jl reproducer:

using Base: Float32
using ROCKernels, CUDAKernels, KernelAbstractions, CUDA, AMDGPU

const TBSize = 1024::Int
const DotBlocks = 256::Int

@enum Backend cuda rocm cpu
const DeviceWithRepr = Tuple{Any,String,Backend}

function list_rocm_devices()::Vector{DeviceWithRepr}
  try
    # AMDGPU.agents()'s internal iteration order isn't stable
    sorted = sort(AMDGPU.get_agents(:gpu), by = repr)
    map(x -> (x, repr(x), rocm), sorted)
  catch
    # probably unsupported
    []
  end
end

function list_cuda_devices()::Vector{DeviceWithRepr}
  return !CUDA.functional(false) ? [] :
         map(d -> (d, "$(CUDA.name(d)) ($(repr(d)))", cuda), CUDA.devices())
end

function devices()::Vector{DeviceWithRepr}
  cudas = list_cuda_devices()
  rocms = list_rocm_devices()
  cpus = [(undef, "$(Sys.cpu_info()[1].model) ($(Threads.nthreads())T)", cpu)]
  vcat(cpus, cudas, rocms)
end


function runDot(device::DeviceWithRepr, size::Int, ainit::T, binit::T)::Tuple{T,T} where {T}
  (actual, name, backend) = device
  println("Using device: ", name)
  as = fill(ainit, size)
  bs = fill(binit, size)
  if backend == cpu
    partial_sum = Vector{T}(undef, DotBlocks)
    a = Vector{T}(as)
    b = Vector{T}(bs)
    backendDevice = CPU()
  elseif backend == cuda
    CUDA.device!(actual)
    partial_sum = CuArray{T}(undef, DotBlocks)
    a = CuArray{T}(as)
    b = CuArray{T}(bs)
    backendDevice = CUDADevice()
  elseif backend == rocm
    AMDGPU.DEFAULT_AGENT[] = actual
    partial_sum = ROCArray{T}(undef, DotBlocks)
    a = ROCArray{T}(as)
    b = ROCArray{T}(bs)
    backendDevice = ROCDevice()
  else
    error("unsupported backend $(backend)")
  end

  @kernel function dot(@Const(a), @Const(b), size, partial)
    local_i = @index(Local)
    group_i = @index(Group)
    tb_sum = @localmem T TBSize
    @inbounds tb_sum[local_i] = 0.0

    # do dot first
    i = @index(Global)
    while i <= size
      @inbounds tb_sum[local_i] += a[i] * b[i]
      i += TBSize * DotBlocks
    end

    # then tree reduction
    offset = @private Int64 (1,)
    @inbounds begin
      offset[1] = @groupsize()[1] ÷ 2
      while offset[1] > 0
        @synchronize
        if (local_i - 1) < offset[1]
          tb_sum[local_i] += tb_sum[local_i+offset[1]]
        end
        offset[1] ÷= 2
      end
    end

    if (local_i == 1)
      @inbounds partial[group_i] = tb_sum[local_i]
    end
  end

  function referenceDot()
    sum = zero(T)
    for i = 1:size
      @inbounds sum += a[i] * b[i]
    end
    return sum
  end

  wait(dot(backendDevice, TBSize)(a, b, size, partial_sum, ndrange = TBSize * DotBlocks))
  return (referenceDot(), sum(partial_sum))
end

device = devices()[1]
(expected, actual) = runDot(device, TBSize * 2, 1.0f0, 2.0f0)
println("actual=", actual, ", expected=", expected)
