-
-
Couldn't load subscription status.
- Fork 5.7k
Description
This may be #15276, but it seems that issue has various gradations of challenge, so perhaps additional examples are useful.
This contains test code implementing the equivalent of map(s->1:s, size(A)) using 3 different strategies: one based on ntuple, several variants of map, and one that manually assembles the tuples. On my laptop, the variation in performance of these methods appears to be >5x; presumably, we'd prefer that they all be the same (and all good!). Naturally, the best of these is also the most laborious (the manual method), and it beats all the others by a factor of 2.
This is admittedly nitty-gritty microoptimization, but was necessary to get #16260 to pass nanosoldier; as I work on the cleanup (#16973), I thought I'd better take the time to document some of the challenges.
# Comparing 3 different implementations of indices
# indices1: calling "ntuple" (`myntuple`, a local implementation you can play with).
# The dimension count N is read off the array type and handed over as `Val{N}`;
# entry d of the result is the range 1:size(A, d).
# Written with `where`: the older `indices1{T,N}(...)` method form is no longer
# accepted by the parser from Julia 1.0 on.
@inline indices1(A::AbstractArray{T,N}) where {T,N} = myntuple(d->1:size(A,d), Val{N})
# indices2: using "map" (a local implementation you can play with)
# Note this is largely academic, since you'd rather define indices2(A)
# in terms of indices2(A, d). Informative nonetheless.
# indices2a: turn every extent in size(A) into the range 1:extent via `mymap_a`.
@inline function indices2a(A)
    extents = size(A)
    return mymap_a(n -> 1:n, extents)
end
# indices2b: turn every extent in size(A) into the range 1:extent via `mymap_b`.
@inline function indices2b(A)
    extents = size(A)
    return mymap_b(n -> 1:n, extents)
end
# indices2c: turn every extent in size(A) into the range 1:extent via `mymap_c`.
@inline function indices2c(A)
    extents = size(A)
    return mymap_c(n -> 1:n, extents)
end
# indices3: directly unrolling the function
# indices3a: start from an empty tuple and append one range per dimension.
@inline indices3a(A) = _indices3a((), A)
# Termination: `out` holds as many entries as `A` has dimensions (both are N).
# `NTuple{N,Any}` places no constraint on the element types of `out`.
# Uses `where`, since the `f{T,N}(...)` method form does not parse on Julia 1.0+.
@inline _indices3a(out::NTuple{N,Any}, A::AbstractArray{T,N}) where {T,N} = out
# Recursive step: the next dimension to handle is length(out)+1.
@inline _indices3a(out, A) = _indices3a((out..., 1:size(A, length(out)+1)), A)
# indices3b: unroll over the size tuple itself, peeling one extent per call.
@inline function indices3b(A)
    extents = size(A)
    return _indices3b(extents)
end
# Spread the size tuple into positional arguments.
@inline function _indices3b(extents::Tuple)
    return _indices3b(extents...)
end
# No extents left: nothing to add.
@inline function _indices3b()
    return ()
end
# Emit the range for the leading extent, followed by the ranges for the rest.
@inline function _indices3b(extent::Integer, remaining...)
    trailing = _indices3b(remaining...)
    return (1:extent, trailing...)
end
# Like ntuple without any checking: returns (f(1), f(2), ..., f(N)) for `Val{N}`.
# The result is grown one element per recursive call, starting from ().
# Uses `where`, since the `f{F,N}(...)` method form does not parse on Julia 1.0+.
@inline myntuple(f::F, v::Type{Val{N}}) where {F,N} = _myntuple((), f, v)
# Termination: `out` already holds N elements. `NTuple{N,Any}` rather than
# `NTuple{N}`, which only matches tuples whose elements all share one type and
# would let the recursion run past N when `f` returns mixed types.
@inline _myntuple(out::NTuple{N,Any}, f::F, ::Type{Val{N}}) where {F,N} = out
# Recursive step: append f(k) for the next index k = length(out)+1.
@inline _myntuple(out, f::F, v::Type{Val{N}}) where {F,N} =
    _myntuple((out..., f(length(out)+1)), f, v)
# Like map, for tuples: apply `f` to the first element, then splat the mapped
# remainder (obtained through Base.tail) behind it.
# An empty tuple maps to an empty tuple.
@inline mymap_a(f, ::Tuple{}) = ()
@inline function mymap_a(f, t)
    head = f(t[1])
    rest = mymap_a(f, Base.tail(t))
    return (head, rest...)
end
# Alternative map: splat the collection into positional arguments, recurse over
# that argument list, then re-splat the helper's result into the returned tuple.
@inline function mymap_b(f, t)
    mapped = _mymap_b(f, t...)
    return (mapped...,)
end
# No elements left.
@inline function _mymap_b(f)
    return ()
end
# Map the leading element and prepend it to the mapped remainder.
@inline function _mymap_b(f, x, xs...)
    head = f(x)
    return (head, _mymap_b(f, xs...)...)
end
# Another alternative map: carry an accumulator tuple through the recursion and
# append each mapped element to its end.
@inline function mymap_c(f, t)
    return _mymap_c((), f, t...)
end
# All elements consumed: the accumulator is the result.
@inline function _mymap_c(acc, f)
    return acc
end
# Map the leading element, append it, and continue with the remainder.
@inline function _mymap_c(acc, f, x, xs...)
    grown = (acc..., f(x))
    return _mymap_c(grown, f, xs...)
end
# Benchmark driver: time every implementation on the same 4-dimensional array.
# `$A` interpolates the global into each benchmark expression, so the global
# lookup is not part of what gets timed.
A = rand(2,3,4,5)
using BenchmarkTools
println("ntuple implementation")
@show @benchmark indices1($A)
println("map implementations")
@show @benchmark indices2a($A)
@show @benchmark indices2b($A)
@show @benchmark indices2c($A)
println("manually-inlined implementation")
@show @benchmark indices3a($A)
@show @benchmark indices3b($A)
# Results:
julia> include("/tmp/dialect.jl")
ntuple implementation
@benchmark(indices1($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 999
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 16.00 bytes
allocs estimate: 1
minimum time: 10.00 ns (0.00% GC)
median time: 11.00 ns (0.00% GC)
mean time: 11.82 ns (5.84% GC)
maximum time: 1.32 μs (97.95% GC)
map implementations
@benchmark(indices2a($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 1000
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 0.00 bytes
allocs estimate: 0
minimum time: 5.00 ns (0.00% GC)
median time: 5.00 ns (0.00% GC)
mean time: 5.01 ns (0.00% GC)
maximum time: 17.00 ns (0.00% GC)
@benchmark(indices2b($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 1000
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 0.00 bytes
allocs estimate: 0
minimum time: 4.00 ns (0.00% GC)
median time: 4.00 ns (0.00% GC)
mean time: 4.03 ns (0.00% GC)
maximum time: 17.00 ns (0.00% GC)
@benchmark(indices2c($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 1000
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 0.00 bytes
allocs estimate: 0
minimum time: 4.00 ns (0.00% GC)
median time: 4.00 ns (0.00% GC)
mean time: 4.05 ns (0.00% GC)
maximum time: 24.00 ns (0.00% GC)
manually-inlined implementation
@benchmark(indices3a($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 1000
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 0.00 bytes
allocs estimate: 0
minimum time: 2.00 ns (0.00% GC)
median time: 2.00 ns (0.00% GC)
mean time: 2.01 ns (0.00% GC)
maximum time: 13.00 ns (0.00% GC)
@benchmark(indices3b($(Expr(:$, :A)))) = BenchmarkTools.Trial:
samples: 10000
evals/sample: 1000
time tolerance: 5.00%
memory tolerance: 1.00%
memory estimate: 0.00 bytes
allocs estimate: 0
minimum time: 5.00 ns (0.00% GC)
median time: 5.00 ns (0.00% GC)
mean time: 5.02 ns (0.00% GC)
maximum time: 19.00 ns (0.00% GC)