From ad09dec86750558dc8c0a9c90296dfd3eb6ec183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20M=C3=BCller-Widmann?= Date: Sun, 19 Oct 2025 08:52:55 -0600 Subject: [PATCH] Fix implicit imports --- Project.toml | 4 +- docs/make.jl | 2 +- src/VectorizationBase.jl | 113 ++++------ src/base_defs.jl | 30 +-- src/cartesianvindex.jl | 2 +- src/early_definitions.jl | 2 +- src/lazymul.jl | 10 +- src/llvm_intrin/binary_ops.jl | 2 +- src/llvm_intrin/conversion.jl | 46 +++- src/llvm_intrin/intrin_funcs.jl | 22 +- src/llvm_intrin/masks.jl | 2 +- src/llvm_intrin/memory_addr.jl | 143 ++++++------- src/llvm_intrin/vector_ops.jl | 62 +++--- src/llvm_types.jl | 24 ++- src/promotion.jl | 4 +- src/ranges.jl | 24 +-- src/special/double.jl | 33 ++- src/special/exp.jl | 17 +- src/special/misc.jl | 4 +- src/static.jl | 4 +- src/strided_pointers/cse_stridemultiples.jl | 5 +- src/strided_pointers/stridedpointers.jl | 6 +- src/vecunroll/fmap.jl | 4 +- src/vecunroll/memory.jl | 53 +++-- test/Project.toml | 8 +- test/accuracy.jl | 4 +- test/runtests.jl | 220 +++++++++++++------- test/testsetup.jl | 4 +- 28 files changed, 419 insertions(+), 435 deletions(-) diff --git a/Project.toml b/Project.toml index 0a4cfd56..092a99a1 100644 --- a/Project.toml +++ b/Project.toml @@ -1,10 +1,11 @@ name = "VectorizationBase" uuid = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f" authors = ["Chris Elrod "] -version = "0.21.72" +version = "0.21.73" [deps] ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +BitTwiddlingConvenienceFunctions = "62783981-4cbd-42fc-bca8-16325de8dc4b" CPUSummary = "2a0fbf3d-bb9c-48f3-b0a9-814d99fd7ab9" HostCPUFeatures = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0" IfElse = "615f187c-cbe4-4ef1-ba3b-2fcf58d6d173" @@ -17,6 +18,7 @@ StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718" [compat] ArrayInterface = "7" +BitTwiddlingConvenienceFunctions = "0.1.6" CPUSummary = "0.1.1 - 0.1.8, 0.1.11, 0.2" HostCPUFeatures = "0.1" IfElse = "0.1" diff --git a/docs/make.jl b/docs/make.jl index 
15edee97..1d11f9eb 100644 --- a/docs/make.jl +++ b/docs/make.jl @@ -10,7 +10,7 @@ makedocs(; prettyurls = get(ENV, "CI", "false") == "true", canonical = "https://JuliaSIMD.github.io/VectorizationBase.jl" ), - pages = ["Home" => "index.md"], + pages = ["Home" => "index.md"] ) deploydocs(; repo = "github.com/JuliaSIMD/VectorizationBase.jl") diff --git a/src/VectorizationBase.jl b/src/VectorizationBase.jl index 0e0fe0e5..918979e8 100644 --- a/src/VectorizationBase.jl +++ b/src/VectorizationBase.jl @@ -6,61 +6,22 @@ end import StaticArrayInterface, LinearAlgebra, Libdl, IfElse, LayoutPointers const ArrayInterface = StaticArrayInterface using StaticArrayInterface: - contiguous_axis, - contiguous_axis_indicator, - contiguous_batch_size, - stride_rank, - device, - CPUPointer, - CPUIndex, - known_length, - known_first, - known_last, - static_size, - static_strides, - offsets, - static_first, - static_last, - static_length + stride_rank, static_strides, offsets, static_first, static_last, static_length import IfElse: ifelse -using CPUSummary: - cache_type, - num_cache, - num_cache_levels, - num_cores, - num_l1cache, - num_l2cache, - cache_associativity, - num_l3cache, - sys_threads, - cache_inclusive, - num_l4cache, - cache_linesize, - num_machines, - cache_size, - num_sockets +using CPUSummary: cache_linesize using HostCPUFeatures: register_size, static_sizeof, - fast_int64_to_double, pick_vector_width, - pick_vector_width_shift, - prevpow2, simd_integer_register_size, fma_fast, smax, - smin, has_feature, - has_opmask_registers, - register_count, - static_sizeof, cpu_name, register_size, - unwrap, - intlog2, - nextpow2, fast_half +using BitTwiddlingConvenienceFunctions: intlog2, nextpow2, prevpow2 import Base: Float16, @@ -74,7 +35,7 @@ import Base: UInt16, UInt32, UInt64, - Bool + Bool using SIMDTypes: Bit, @@ -82,28 +43,35 @@ using SIMDTypes: SignedHW, UnsignedHW, IntegerTypesHW, - NativeTypesExceptBitandFloat16, NativeTypesExceptBit, NativeTypesExceptFloat16, 
NativeTypes, _Vec using LayoutPointers: AbstractStridedPointer, - StridedPointer, StridedBitPointer, - memory_reference, stridedpointer, - zstridedpointer, similar_no_offset, similar_with_offset, - grouped_strided_pointer, - stridedpointers, bytestrides, - DensePointerWrapper, zero_offsets -using Static -using Static: One, Zero, eq, ne, lt, le, gt, ge +using Static: + Static, + One, + Zero, + eq, + lt, + le, + gt, + ge, + ne, + True, + False, + StaticBool, + StaticInt, + known, + static @inline function promote(x::X, y::Y) where {X,Y} T = promote_type(X, Y) @@ -363,7 +331,7 @@ function Base.show(io::IO, v::AbstractSIMDVector{W,T}) where {W,T} end print(io, ">") end -Base.bitstring(m::AbstractMask{W}) where {W} = bitstring(data(m))[end-W+1:end] +Base.bitstring(m::AbstractMask{W}) where {W} = bitstring(data(m))[(end-W+1):end] function Base.show(io::IO, m::AbstractMask{W}) where {W} bits = data(m) if m isa EVLMask @@ -371,7 +339,7 @@ function Base.show(io::IO, m::AbstractMask{W}) where {W} else print(io, "Mask{$W,Bit}<") end - for w ∈ 0:W-1 + for w ∈ 0:(W-1) print(io, (bits & 0x01) % Int) bits >>= 0x01 w < W - 1 && print(io, ", ") @@ -381,7 +349,7 @@ end function Base.show(io::IO, vu::VecUnroll{N,W,T,V}) where {N,W,T,V} println(io, "$(N+1) x $V") d = data(vu) - for n = 1:N+1 + for n = 1:(N+1) show(io, d[n]) n > N || println(io) end @@ -508,20 +476,23 @@ demoteint(::Type{Int64}, W::StaticInt) = gt(W, pick_vector_width(Int64)) end meta = Expr(:meta, :inline) if VERSION >= v"1.8.0-beta" - purity = Expr(:purity, - #= consistent =# true, - #= effect_free =# true, - #= nothrow =# true, - #= terminates_globally =# true, - #= terminates_locally =# false) + purity = Expr( + :purity, + #= consistent =#true, + #= effect_free =#true, + #= nothrow =#true, + #= terminates_globally =#true, + #= terminates_locally =#false + ) if VERSION >= v"1.11" - push!(purity.args, - #= notaskstate =# true, - #= inaccessiblememonly =# true, - #= noub =# true, - #= noub_if_noinbounds =# false, - #= 
consistent_overlay =# false, - #= nortcall =# true, + push!( + purity.args, + #= notaskstate =#true, + #= inaccessiblememonly =#true, + #= noub =#true, + #= noub_if_noinbounds =#false, + #= consistent_overlay =#false, + #= nortcall =#true ) end push!(meta.args, purity) @@ -537,9 +508,9 @@ function vec_quote(demote, W, Wpow2, offset::Int = 0) iszero(offset) && push!(call.args, :y) foreach( w -> push!(call.args, Expr(:call, getfield, :x, w, false)), - max(1, offset):min(W, Wpow2)-1 + max(1, offset):(min(W, Wpow2)-1) ) - foreach(w -> push!(call.args, Expr(:call, :zero, :T)), W+1:Wpow2) + foreach(w -> push!(call.args, Expr(:call, :zero, :T)), (W+1):Wpow2) call end @generated function _vec( @@ -578,7 +549,7 @@ else end end @inline reduce_to_onevec(f::F, vu::VecUnroll) where {F} = - ArrayInterface.reduce_tup(f, data(vu)) + Static.reduce_tup(f, data(vu)) if VERSION >= v"1.7.0" && hasfield(Method, :recursion_relation) dont_limit = Returns(true) diff --git a/src/base_defs.jl b/src/base_defs.jl index 66f1dffa..981b6f2b 100644 --- a/src/base_defs.jl +++ b/src/base_defs.jl @@ -1,27 +1,3 @@ -const FASTDICT = Dict{Symbol,Expr}([ - :(+) => :(Base.FastMath.add_fast), - :(-) => :(Base.FastMath.sub_fast), - :(*) => :(Base.FastMath.mul_fast), - :(/) => :(Base.FastMath.div_fast), - :(÷) => :(VectorizationBase.vdiv_fast), # VectorizationBase.vdiv == integer, VectorizationBase.vfdiv == float - :(%) => :(Base.FastMath.rem_fast), - :abs2 => :(Base.FastMath.abs2_fast), - :inv => :(Base.FastMath.inv_fast), # this is slower in most benchmarks - :hypot => :(Base.FastMath.hypot_fast), - :max => :(Base.FastMath.max_fast), - :min => :(Base.FastMath.min_fast), - :muladd => :(VectorizationBase.vmuladd_fast), - :fma => :(VectorizationBase.vfma_fast), - :vfmadd => :(VectorizationBase.vfmadd_fast), - :vfnmadd => :(VectorizationBase.vfnmadd_fast), - :vfmsub => :(VectorizationBase.vfmsub_fast), - :vfnmsub => :(VectorizationBase.vfnmsub_fast), - :log => :(SLEEFPirates.log_fast), - :log2 => 
:(SLEEFPirates.log2_fast), - :log10 => :(SLEEFPirates.log10_fast), - :(^) => :(Base.FastMath.pow_fast) -]) - for (op, f) ∈ [ (:(Base.:-), :vsub), (:(Base.FastMath.sub_fast), :vsub_fast), @@ -260,11 +236,7 @@ end x, y = promote(a, b) VecUnroll(fmap(ifelse, getfield(m, :data), unrolldata(x), unrolldata(y))) end -@inline function IfElse.ifelse( - m::VecUnroll{<:Any,<:Any,Bool}, - a::Real, - b::Real -) +@inline function IfElse.ifelse(m::VecUnroll{<:Any,<:Any,Bool}, a::Real, b::Real) x, y = promote(a, b) VecUnroll(fmap(ifelse, getfield(m, :data), unrolldata(x), unrolldata(y))) end diff --git a/src/cartesianvindex.jl b/src/cartesianvindex.jl index 7f7187fa..a3e400ad 100644 --- a/src/cartesianvindex.jl +++ b/src/cartesianvindex.jl @@ -8,7 +8,7 @@ struct CartesianVIndex{N,T<:Tuple{Vararg{Union{Int,StaticInt,NullStep},N}}} <: ) where {N,T<:Tuple{Vararg{Union{Int,StaticInt,NullStep},N}}} = new{N,T}(I) end Base.length(::CartesianVIndex{N}) where {N} = N -ArrayInterface.known_length(::Type{<:CartesianVIndex{N}}) where {N} = N +StaticArrayInterface.known_length(::Type{<:CartesianVIndex{N}}) where {N} = N Base.Tuple(i::CartesianVIndex) = getfield(i, :I) function Base.:(:)(I::CartesianVIndex{N}, J::CartesianVIndex{N}) where {N} CartesianIndices(map((i, j) -> i:j, getfield(I, :I), getfield(J, :I))) diff --git a/src/early_definitions.jl b/src/early_definitions.jl index 1965769a..eb2f1b23 100644 --- a/src/early_definitions.jl +++ b/src/early_definitions.jl @@ -56,7 +56,7 @@ end end @inline integer_preference(::StaticInt{B}) where {B} = - ifelse(ArrayInterface.ge(StaticInt{B}(), StaticInt{8}()), Int, Int32) + ifelse(ge(StaticInt{B}(), StaticInt{8}()), Int, Int32) @inline pick_integer(::Union{StaticInt{W},Val{W}}) where {W} = integer_preference(simd_integer_register_size() ÷ StaticInt{W}()) diff --git a/src/lazymul.jl b/src/lazymul.jl index 285a3245..85522381 100644 --- a/src/lazymul.jl +++ b/src/lazymul.jl @@ -432,8 +432,7 @@ end if iszero(r) quote $(Expr(:meta, :inline)) - p, - 
VectorizationBase.LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) + p, LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) end else quote @@ -451,8 +450,7 @@ end if iszero(r) quote $(Expr(:meta, :inline)) - p, - VectorizationBase.LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) + p, LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) end else quote @@ -479,7 +477,7 @@ end if iszero(r) quote $(Expr(:meta, :inline)) - VectorizationBase.LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) + LazyMulAdd{$N,$(I * M)}(MM{$W,$d}(getfield(b, :data))) end else quote @@ -509,7 +507,7 @@ end if iszero(r) quote $(Expr(:meta, :inline)) - VectorizationBase.LazyMulAdd{$N,$(I * M)}(-MM{$W,$d}(getfield(b, :data))) + LazyMulAdd{$N,$(I * M)}(-MM{$W,$d}(getfield(b, :data))) end else quote diff --git a/src/llvm_intrin/binary_ops.jl b/src/llvm_intrin/binary_ops.jl index fc7019fe..5a0763d0 100644 --- a/src/llvm_intrin/binary_ops.jl +++ b/src/llvm_intrin/binary_ops.jl @@ -144,7 +144,7 @@ end v2::AbstractSIMD{W,T} ) where {W,T<:FloatingTypes} = trunc(vfdiv_fast(v1, v2)) @inline vdiv_fast(v1::T, v2::T) where {T<:FloatingTypes} = - trunc(Base.FastMath.div_float_fast(v1, v2)) + trunc(Core.Intrinsics.div_float_fast(v1, v2)) @inline vdiv_fast(v1::T, v2::T) where {T<:Number} = v1 ÷ v2 @inline vdiv(v1::T, v2::T) where {T<:Number} = v1 ÷ v2 @inline vdiv(v1::T, v2::T) where {T<:FloatingTypes} = vdiv_fast(v1, v2) diff --git a/src/llvm_intrin/conversion.jl b/src/llvm_intrin/conversion.jl index f2d2a7d4..ec4ef10b 100644 --- a/src/llvm_intrin/conversion.jl +++ b/src/llvm_intrin/conversion.jl @@ -62,7 +62,7 @@ if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) ::False ) where {W,F} neg = v < 0 - pos = ifelse(neg, -v, v) + pos = ifelse(neg, -v, v) posf = _vconvert(Vec{W,F}, UInt64(pos), False()) ifelse(neg, -posf, posf) end @@ -85,7 +85,12 @@ if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) @inline function vconvert( ::Type{F}, v::VecUnroll{N,W,T,Vec{W,T}} - 
)::VecUnroll{N,W,F,Vec{W,F}} where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} + )::VecUnroll{ + N, + W, + F, + Vec{W,F} + } where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} _vconvert( Vec{W,F}, v, @@ -95,7 +100,12 @@ if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) @inline function vconvert( ::Type{Vec{W,F}}, v::VecUnroll{N,W,T,Vec{W,T}} - )::VecUnroll{N,W,F,Vec{W,F}} where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} + )::VecUnroll{ + N, + W, + F, + Vec{W,F} + } where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} _vconvert( Vec{W,F}, v, @@ -105,7 +115,12 @@ if (Sys.ARCH === :x86_64) || (Sys.ARCH === :i686) @inline function vconvert( ::Type{VecUnroll{N,W,F,Vec{W,F}}}, v::VecUnroll{N,W,T,Vec{W,T}} - )::VecUnroll{N,W,F,Vec{W,F}} where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} + )::VecUnroll{ + N, + W, + F, + Vec{W,F} + } where {N,W,F<:FloatingTypes,T<:Union{UInt64,Int64}} _vconvert( Vec{W,F}, v, @@ -165,13 +180,13 @@ end @inline vconvert(::Type{M}, v::Vec{W,Bool}) where {W,U,M<:AbstractMask{W,U}} = tomask(v) @inline vconvert( - ::Type{<:VectorizationBase.AbstractMask{W,U} where {U}}, + ::Type{<:AbstractMask{W,U} where {U}}, v::Vec{W,Bool} -) where {W} = VectorizationBase.tomask(v) +) where {W} = tomask(v) @inline vconvert( - ::Type{<:VectorizationBase.AbstractMask{L,U} where {L,U}}, + ::Type{<:AbstractMask{L,U} where {L,U}}, v::Vec{W,Bool} -) where {W} = VectorizationBase.tomask(v) +) where {W} = tomask(v) # @inline vconvert(::Type{Mask}, v::Vec{W,Bool}) where {W} = tomask(v) # @generated function vconvert(::Type{<:AbstractMask{W}}, v::Vec{W,Bool}) where {W} # instrs = String[] @@ -229,9 +244,18 @@ end ### `vconvert(::Type{<:NativeTypes}, x)` methods. 
These forward to `vconvert(::Type{Vec{W,T}}, x)` @inline vconvert(::Type{T}, s::T) where {T<:NativeTypes} = s @inline vconvert(::Type{T}, s::T) where {T<:IntegerTypesHW} = s -@inline vconvert(::Type{T}, s::Union{Float16,Float32,Float64}) where {T<:IntegerTypesHW} = Base.fptosi(T, Base.trunc_llvm(s)) -@inline vconvert(::Type{T}, s::IntegerTypesHW) where {T<:Union{Float16,Float32,Float64}} = convert(T, s)::T -@inline vconvert(::Type{T}, s::Union{Float16,Float32,Float64}) where {T<:Union{Float16,Float32,Float64}} = convert(T, s)::T +@inline vconvert( + ::Type{T}, + s::Union{Float16,Float32,Float64} +) where {T<:IntegerTypesHW} = Base.fptosi(T, Base.trunc_llvm(s)) +@inline vconvert( + ::Type{T}, + s::IntegerTypesHW +) where {T<:Union{Float16,Float32,Float64}} = convert(T, s)::T +@inline vconvert( + ::Type{T}, + s::Union{Float16,Float32,Float64} +) where {T<:Union{Float16,Float32,Float64}} = convert(T, s)::T @inline vconvert(::Type{T}, s::T) where {T<:Union{Float16,Float32,Float64}} = s @inline vconvert(::Type{T}, s::IntegerTypesHW) where {T<:IntegerTypesHW} = s % T @inline vconvert(::Type{T}, v::AbstractSIMD{W,T}) where {T<:NativeTypes,W} = v diff --git a/src/llvm_intrin/intrin_funcs.jl b/src/llvm_intrin/intrin_funcs.jl index 536b5b71..c7f79845 100644 --- a/src/llvm_intrin/intrin_funcs.jl +++ b/src/llvm_intrin/intrin_funcs.jl @@ -156,9 +156,10 @@ for (op, f) ∈ [ ("nearbyint", :vround)#,("roundeven",:roundeven) ] # @eval @generated Base.$f(v1::Vec{W,T}) where {W, T <: Union{Float32,Float64}} = llvmcall_expr($op, W, T, (W,), (T,), "nsz arcp contract afn reassoc") - @eval @generated $f(v1::Vec{W,T}) where {W,T<:Union{Float32,Float64}} = - (TS = T === Float32 ? :Float32 : :Float64; - build_llvmcall_expr($op, W, TS, [W], [TS], "fast")) + @eval @generated $f(v1::Vec{W,T}) where {W,T<:Union{Float32,Float64}} = ( + TS = T === Float32 ? 
:Float32 : :Float64; + build_llvmcall_expr($op, W, TS, [W], [TS], "fast") + ) end @inline vsqrt(v::AbstractSIMD{W,T}) where {W,T<:IntegerTypes} = vsqrt(float(v)) @inline vsqrt(v::FloatingTypes) = Base.sqrt_llvm_fast(v) @@ -459,16 +460,16 @@ end @inline vfma_fast(a::NativeTypes, b::NativeTypes, c::NativeTypes) = muladd(a, b, c) @inline vmuladd_fast(a::Float32, b::Float32, c::Float32) = - Base.FastMath.add_float_fast(Base.FastMath.mul_float_fast(a, b), c) + Core.Intrinsics.add_float_fast(Core.Intrinsics.mul_float_fast(a, b), c) @inline vmuladd_fast(a::Float64, b::Float64, c::Float64) = - Base.FastMath.add_float_fast(Base.FastMath.mul_float_fast(a, b), c) + Core.Intrinsics.add_float_fast(Core.Intrinsics.mul_float_fast(a, b), c) @inline vmuladd_fast(a::NativeTypes, b::NativeTypes, c::NativeTypes) = - Base.FastMath.add_fast(Base.FastMath.mul_fast(a, b), c) + Base.FastMath.add_fast(Base.FastMath.mul_fast(a, b), c) @inline vfma(a, b, c) = fma(a, b, c) @inline vmuladd(a, b, c) = muladd(a, b, c) @inline vfma_fast(a, b, c) = fma(a, b, c) @inline vmuladd_fast(a, b, c) = - Base.FastMath.add_fast(Base.FastMath.mul_fast(a, b), c) + Base.FastMath.add_fast(Base.FastMath.mul_fast(a, b), c) for f ∈ [:vfma, :vmuladd, :vfma_fast, :vmuladd_fast] @eval @inline function $f( v1::AbstractSIMD{W,T}, @@ -604,7 +605,7 @@ function collapse_mirror_expr(N, op, final) 2final end while N > _final - for n ∈ 1:N>>>1 + for n ∈ 1:(N>>>1) push!(q.args, Expr(:(=), cmp[n], Expr(:call, op, s[n], s[n+(N>>>1)]))) push!( q.args, @@ -623,7 +624,7 @@ function collapse_mirror_expr(N, op, final) N >>>= 1 end if final ≠ 1 - for n ∈ final+1:N + for n ∈ (final+1):N push!(q.args, Expr(:(=), cmp[n-final], Expr(:call, op, s[n-final], s[n]))) push!( q.args, @@ -741,8 +742,7 @@ for (op, f, S) ∈ [ end if Sys.ARCH == :aarch64 # TODO: maybe the default definition will stop segfaulting some day?
for I ∈ (:Int64, :UInt64), (f, op) ∈ ((:vmaximum, :max), (:vminimum, :min)) - @eval @inline $f(v::Vec{W,$I}) where {W} = - ArrayInterface.reduce_tup($op, Tuple(v)) + @eval @inline $f(v::Vec{W,$I}) where {W} = Static.reduce_tup($op, Tuple(v)) end end diff --git a/src/llvm_intrin/masks.jl b/src/llvm_intrin/masks.jl index 46e3eed8..39463713 100644 --- a/src/llvm_intrin/masks.jl +++ b/src/llvm_intrin/masks.jl @@ -372,7 +372,7 @@ end @inline vzero(::EVLMask{W,U}) where {W,U} = EVLMask{W}(zero(U), 0x00000000) @inline Base.zero(::Type{M}) where {W,M<:AbstractMask{W}} = vzero(M) @inline zero_mask(::Union{Val{W},StaticInt{W}}) where {W} = - EVLMask{W}(zero(VectorizationBase.mask_type(Val{W}())), 0x00000000) + EVLMask{W}(zero(mask_type(Val{W}())), 0x00000000) @generated function max_mask(::Union{Val{W},StaticInt{W}}) where {W} U = mask_type(W) diff --git a/src/llvm_intrin/memory_addr.jl b/src/llvm_intrin/memory_addr.jl index 8042e9f4..93f5147f 100644 --- a/src/llvm_intrin/memory_addr.jl +++ b/src/llvm_intrin/memory_addr.jl @@ -168,16 +168,13 @@ function offset_ptr( # after this block, we will have a index_gep_typ pointer if iszero(O) @static if USE_OPAQUE_PTR - push!( - instrs, - "%ptr.$(i) = bitcast $(JULIAPOINTERTYPE) %0 to ptr" - ) + push!(instrs, "%ptr.$(i) = bitcast $(JULIAPOINTERTYPE) %0 to ptr") else push!( instrs, "%ptr.$(i) = inttoptr $(JULIAPOINTERTYPE) %0 to $(index_gep_typ)*" - ) - end + ) + end i += 1 else # !iszero(O) if !iszero(O & (tzf - 1)) # then index_gep_typ works for the constant offset @@ -188,15 +185,12 @@ function offset_ptr( offset = O >> tz end @static if USE_OPAQUE_PTR + push!(instrs, "%ptr.$(i) = bitcast $(JULIAPOINTERTYPE) %0 to ptr") + else push!( instrs, - "%ptr.$(i) = bitcast $(JULIAPOINTERTYPE) %0 to ptr" + "%ptr.$(i) = inttoptr $(JULIAPOINTERTYPE) %0 to $(offset_gep_typ)*" ) - else - push!( - instrs, - "%ptr.$(i) = inttoptr $(JULIAPOINTERTYPE) %0 to $(offset_gep_typ)*" - ) end i += 1 @static if USE_OPAQUE_PTR @@ -206,9 +200,9 @@ function 
offset_ptr( ) else push!( - instrs, - "%ptr.$(i) = getelementptr inbounds $(offset_gep_typ), $(offset_gep_typ)* %ptr.$(i-1), i32 $(offset)" - ) + instrs, + "%ptr.$(i) = getelementptr inbounds $(offset_gep_typ), $(offset_gep_typ)* %ptr.$(i-1), i32 $(offset)" + ) end i += 1 if forgep && iszero(M) && (iszero(X) || isone(X)) @@ -216,26 +210,23 @@ function offset_ptr( push!( instrs, "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to $(JULIAPOINTERTYPE)" - ) + ) else push!( - instrs, - "%ptr.$(i) = ptrtoint $(offset_gep_typ)* %ptr.$(i-1) to $(JULIAPOINTERTYPE)" + instrs, + "%ptr.$(i) = ptrtoint $(offset_gep_typ)* %ptr.$(i-1) to $(JULIAPOINTERTYPE)" ) - end + end i += 1 return instrs, i elseif offset_gep_typ != index_gep_typ @static if USE_OPAQUE_PTR + push!(instrs, "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to ptr") + else push!( instrs, - "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to ptr" + "%ptr.$(i) = bitcast $(offset_gep_typ)* %ptr.$(i-1) to $(index_gep_typ)*" ) - else - push!( - instrs, - "%ptr.$(i) = bitcast $(offset_gep_typ)* %ptr.$(i-1) to $(index_gep_typ)*" - ) end i += 1 end @@ -270,12 +261,12 @@ function offset_ptr( instrs, "%ptr.$(i) = bitcast <$W x ptr> %ptr.$(i-1) to <$W x $JULIAPOINTERTYPE>" ) - else + else push!( - instrs, - "%ptr.$(i) = ptrtoint <$W x $index_gep_typ*> %ptr.$(i-1) to <$W x $JULIAPOINTERTYPE>" - ) - end + instrs, + "%ptr.$(i) = ptrtoint <$W x $index_gep_typ*> %ptr.$(i-1) to <$W x $JULIAPOINTERTYPE>" + ) + end i += 1 elseif index_gep_typ != vtyp @static if USE_OPAQUE_PTR @@ -288,7 +279,7 @@ function offset_ptr( instrs, "%ptr.$(i) = bitcast <$W x $index_gep_typ*> %ptr.$(i-1) to <$W x $typ*>" ) - end + end i += 1 end return instrs, i @@ -353,18 +344,15 @@ function offset_ptr( # to avoid overflow vibytes = max(min(4, rs ÷ W), nextpow2(intlog2(X * W - 1) + 2) >> 3) vityp = "i$(8vibytes)" - vi = join((X * w for w ∈ 0:W-1), ", $vityp ") + vi = join((X * w for w ∈ 0:(W-1)), ", $vityp ") if typ !== index_gep_typ @static if USE_OPAQUE_PTR + push!(instrs, "%ptr.$(i) = 
bitcast ptr %ptr.$(i-1) to ptr") + else push!( instrs, - "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to ptr" + "%ptr.$(i) = bitcast $(index_gep_typ)* %ptr.$(i-1) to $(typ)*" ) - else - push!( - instrs, - "%ptr.$(i) = bitcast $(index_gep_typ)* %ptr.$(i-1) to $(typ)*" - ) end i += 1 end @@ -375,8 +363,8 @@ function offset_ptr( ) else push!( - instrs, - "%ptr.$(i) = getelementptr inbounds $(typ), $(typ)* %ptr.$(i-1), <$W x $(vityp)> <$vityp $vi>" + instrs, + "%ptr.$(i) = getelementptr inbounds $(typ), $(typ)* %ptr.$(i-1), <$W x $(vityp)> <$vityp $vi>" ) end i += 1 @@ -388,8 +376,8 @@ function offset_ptr( ) else push!( - instrs, - "%ptr.$(i) = ptrtoint <$W x $typ*> %ptr.$(i-1) to <$W x $JULIAPOINTERTYPE>" + instrs, + "%ptr.$(i) = ptrtoint <$W x $typ*> %ptr.$(i-1) to <$W x $JULIAPOINTERTYPE>" ) end i += 1 @@ -398,11 +386,8 @@ function offset_ptr( end if forgep # if forgep, just return now @static if USE_OPAQUE_PTR - push!( - instrs, - "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to $JULIAPOINTERTYPE" - ) - else + push!(instrs, "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to $JULIAPOINTERTYPE") + else push!( instrs, "%ptr.$(i) = ptrtoint $(index_gep_typ)* %ptr.$(i-1) to $JULIAPOINTERTYPE" @@ -411,14 +396,11 @@ function offset_ptr( i += 1 elseif index_gep_typ != vtyp @static if USE_OPAQUE_PTR - push!( - instrs, - "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to ptr" - ) + push!(instrs, "%ptr.$(i) = bitcast ptr %ptr.$(i-1) to ptr") else push!( - instrs, - "%ptr.$(i) = bitcast $(index_gep_typ)* %ptr.$(i-1) to $(vtyp)*" + instrs, + "%ptr.$(i) = bitcast $(index_gep_typ)* %ptr.$(i-1) to $(vtyp)*" ) end i += 1 @@ -566,7 +548,7 @@ end ) where {N,C,B,R} stridedpointer( pointer(ptr), - ArrayInterface.StrideIndex{N,R,C}(static_strides(ptr), offs), + StaticArrayInterface.StrideIndex{N,R,C}(static_strides(ptr), offs), StaticInt{B}() ) end @@ -583,7 +565,7 @@ end ) where {N,C,B,R,K} stridedpointer( pointer(ptr), - ArrayInterface.StrideIndex{N,R,C}( + StaticArrayInterface.StrideIndex{N,R,C}( static_strides(ptr), 
increment_ptr(ptr, i) ), @@ -917,12 +899,11 @@ function vload_quote_llvmcall_core( end if grv @static if USE_OPAQUE_PTR - loadinstr = - "$vtyp @llvm.masked.gather." * - suffix(W, T_sym) + loadinstr = "$vtyp @llvm.masked.gather." * suffix(W, T_sym) decl *= "declare $loadinstr(<$W x ptr>, i32, <$W x i1>, $vtyp)" else - loadinstr = "$vtyp @llvm.masked.gather." * + loadinstr = + "$vtyp @llvm.masked.gather." * suffix(W, T_sym) * '.' * ptr_suffix(W, T_sym) @@ -939,9 +920,9 @@ function vload_quote_llvmcall_core( ) else push!( - instrs, - "%res = call $loadinstr(<$W x $typ*> %ptr.$(i-1), i32 $alignment, <$W x i1> $m, $vtyp $passthrough)" * - LOAD_SCOPE_TBAA_FLAGS + instrs, + "%res = call $loadinstr(<$W x $typ*> %ptr.$(i-1), i32 $alignment, <$W x i1> $m, $vtyp $passthrough)" * + LOAD_SCOPE_TBAA_FLAGS ) end elseif mask @@ -972,9 +953,9 @@ function vload_quote_llvmcall_core( ) else push!( - instrs, - "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" * - LOAD_SCOPE_TBAA_FLAGS + instrs, + "%res = load $vtyp, $vtyp* %ptr.$(i-1), align $alignment" * + LOAD_SCOPE_TBAA_FLAGS ) end end @@ -1381,10 +1362,8 @@ function vstore_quote( argtostore = "%1" end if grv - @static if USE_OPAQUE_PTR - storeinstr = - "void @llvm.masked.scatter." * - suffix(W, T_sym) + @static if USE_OPAQUE_PTR + storeinstr = "void @llvm.masked.scatter." * suffix(W, T_sym) decl *= "declare $storeinstr($vtyp, <$W x ptr>, i32, <$W x i1>)" else storeinstr = @@ -1394,7 +1373,7 @@ function vstore_quote( ptr_suffix(W, T_sym) decl *= "declare $storeinstr($vtyp, <$W x $typ*>, i32, <$W x i1>)" end - m = mask ? m = "%mask.0" : llvmconst(W, "i1 1") + m = mask ? 
m = "%mask.0" : llvmconst(W, "i1 1") @static if USE_OPAQUE_PTR push!( instrs, @@ -1403,9 +1382,9 @@ function vstore_quote( ) else push!( - instrs, - "call $storeinstr($vtyp $(argtostore), <$W x $typ*> %ptr.$(i-1), i32 $alignment, <$W x i1> $m)" * - metadata + instrs, + "call $storeinstr($vtyp $(argtostore), <$W x $typ*> %ptr.$(i-1), i32 $alignment, <$W x i1> $m)" * + metadata ) end # push!(instrs, "call $storeinstr($vtyp $(argtostore), <$W x $typ*> %ptr.$(i-1), i32 $alignment, <$W x i1> $m)") @@ -1422,7 +1401,7 @@ function vstore_quote( else storeinstr = "void @llvm.masked.store." * suff decl *= "declare $storeinstr($vtyp, $vtyp*, i32, <$W x i1>)" - push!( + push!( instrs, "call $storeinstr($vtyp $(argtostore), $vtyp* %ptr.$(i-1), i32 $alignment, <$W x i1> %mask.0)" * metadata @@ -2357,7 +2336,7 @@ end ret void """ end - + llvmcall_expr( decl, instrs, @@ -2433,7 +2412,7 @@ end ) end @generated function lifetime_end!(ptr::Ptr{T}, ::Val{L}) where {L,T} - @static if USE_OPAQUE_PTR + @static if USE_OPAQUE_PTR decl = "declare void @llvm.lifetime.end(i64, ptr nocapture)" instrs = """ call void @llvm.lifetime.end(i64 $L, ptr %0) @@ -2601,13 +2580,11 @@ end Expr( :block, Expr(:meta, :inline), - :( - unsafe_store!( - Base.unsafe_convert(Ptr{Ptr{Cvoid}}, p) + convert(Int, i), - Base.pointer_from_objref(v) - ); - return nothing - ) + :(unsafe_store!( + Base.unsafe_convert(Ptr{Ptr{Cvoid}}, p) + convert(Int, i), + Base.pointer_from_objref(v) + ); + return nothing) ) end end diff --git a/src/llvm_intrin/vector_ops.jl b/src/llvm_intrin/vector_ops.jl index 154ef5b9..d359fdd7 100644 --- a/src/llvm_intrin/vector_ops.jl +++ b/src/llvm_intrin/vector_ops.jl @@ -17,11 +17,10 @@ function shufflevector_instrs( mask::String = '<' * join(I, ", ")::String * '>' if ((W2 == 0) | (W2 == W)) v2 = W2 == 0 ? 
"undef" : "%1" - M, + M, """ + %res = shufflevector $vtyp1 %0, $vtyp1 $v2, $vtyp3 $mask + ret $vtypr %res """ - %res = shufflevector $vtyp1 %0, $vtyp1 $v2, $vtyp3 $mask - ret $vtypr %res - """ else vtyp0 = "<$W2 x $typ>" maskpad = @@ -31,12 +30,11 @@ function shufflevector_instrs( ", " ) * '>' - M, + M, """ + %pad = shufflevector $vtyp0 %1, $vtyp0 undef, <$W x i32> $maskpad + %res = shufflevector $vtyp1 %0, $vtyp1 %pad, $vtyp3 $mask + ret $vtypr %res """ - %pad = shufflevector $vtyp0 %1, $vtyp0 undef, <$W x i32> $maskpad - %res = shufflevector $vtyp1 %0, $vtyp1 %pad, $vtyp3 $mask - ret $vtypr %res - """ end end function tupletostringvector(@nospecialize(x::NTuple{N,Int})) where {N} @@ -94,7 +92,7 @@ end typ = LLVM_TYPES[T] mask = '<' * - join(map(x -> string("i32 ", x ≥ L ? "undef" : string(x)), 0:W-1), ", ") * + join(map(x -> string("i32 ", x ≥ L ? "undef" : string(x)), 0:(W-1)), ", ") * '>' instrs = """ %res = shufflevector <$L x $typ> %0, <$L x $typ> undef, <$W x i32> $mask @@ -144,10 +142,10 @@ end ) ) mask = Vector{String}(undef, 2W1) - for w ∈ 0:W1+W2-1 + for w ∈ 0:(W1+W2-1) mask[w+1] = string("i32 ", w) end - for w ∈ W1+W2:2W1-1 + for w ∈ (W1+W2):(2W1-1) mask[w+1] = "i32 undef" end M, instrs = shufflevector_instrs(W1, T, mask, W2) @@ -194,8 +192,8 @@ function transpose_vecunroll_quote(W) log2W = intlog2(W) q = Expr(:block, Expr(:meta, :inline), :(vud = data(vu))) N = W # N vectors of length W - vectors1 = [Symbol(:v_, n) for n ∈ 0:N-1] - vectors2 = [Symbol(:v_, n + N) for n ∈ 0:N-1] + vectors1 = [Symbol(:v_, n) for n ∈ 0:(N-1)] + vectors2 = [Symbol(:v_, n + N) for n ∈ 0:(N-1)] # z = Expr(:call, Expr(:curly, Expr(:(.), :VectorizationBase, QuoteNode(:MM)), W), 0) # for n ∈ 1:N # push!(q.args, Expr(:(=), vectors1[n], Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(:vload)), :ptrA, Expr(:tuple, z, n-1)))) @@ -209,10 +207,10 @@ function transpose_vecunroll_quote(W) Nhalf = N >>> 1 vecstride = 1 partition_stride = 2 - for nsplits = 0:log2W-1 + for nsplits = 
0:(log2W-1) shuffle0 = transposeshuffle(nsplits, W, false) shuffle1 = transposeshuffle(nsplits, W, true) - for partition ∈ 0:(W>>>(nsplits+1))-1 + for partition ∈ 0:((W>>>(nsplits+1))-1) for _n1 ∈ 1:vecstride n1 = partition * partition_stride + _n1 n2 = n1 + vecstride @@ -243,7 +241,7 @@ function transpose_vecunroll_quote(W) end function subset_tup(W, o) t = Expr(:tuple) - for w ∈ o:W-1+o + for w ∈ o:(W-1+o) push!(t.args, w) end Expr(:call, Expr(:curly, :Val, t)) @@ -258,8 +256,8 @@ function transpose_vecunroll_quote_W_larger(N, W) log2N = intlog2(N) q = Expr(:block, Expr(:meta, :inline), :(vud = data(vu))) # N = W # N vectors of length W - vectors1 = [Symbol(:v_, n) for n ∈ 0:N-1] - vectors2 = [Symbol(:v_, n + N) for n ∈ 0:N-1] + vectors1 = [Symbol(:v_, n) for n ∈ 0:(N-1)] + vectors2 = [Symbol(:v_, n + N) for n ∈ 0:(N-1)] # z = Expr(:call, Expr(:curly, Expr(:(.), :VectorizationBase, QuoteNode(:MM)), W), 0) # for n ∈ 1:N # push!(q.args, Expr(:(=), vectors1[n], Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(:vload)), :ptrA, Expr(:tuple, z, n-1)))) @@ -273,10 +271,10 @@ function transpose_vecunroll_quote_W_larger(N, W) Nhalf = N >>> 1 vecstride = 1 partition_stride = 2 - for nsplits = 0:log2N-1 + for nsplits = 0:(log2N-1) shuffle0 = transposeshuffle(nsplits, W, false) shuffle1 = transposeshuffle(nsplits, W, true) - for partition ∈ 0:(N>>>(nsplits+1))-1 + for partition ∈ 0:((N>>>(nsplits+1))-1) for _n1 ∈ 1:vecstride n1 = partition * partition_stride + _n1 n2 = n1 + vecstride @@ -322,13 +320,13 @@ function transpose_vecunroll_quote_W_smaller(N, W) log2N = intlog2(N) q = Expr(:block, Expr(:meta, :inline), :(vud = data(vu))) # N = W # N vectors of length W - vectors1 = [Symbol(:v_, n) for n ∈ 0:N-1] - vectors2 = [Symbol(:v_, n + N) for n ∈ 0:N-1] + vectors1 = [Symbol(:v_, n) for n ∈ 0:(N-1)] + vectors2 = [Symbol(:v_, n + N) for n ∈ 0:(N-1)] # z = Expr(:call, Expr(:curly, Expr(:(.), :VectorizationBase, QuoteNode(:MM)), W), 0) # for n ∈ 1:N # push!(q.args, Expr(:(=), 
vectors1[n], Expr(:call, Expr(:(.), :VectorizationBase, QuoteNode(:vload)), :ptrA, Expr(:tuple, z, n-1)))) # end - vectors3 = [Symbol(:vpiece_, w) for w ∈ 0:W-1] + vectors3 = [Symbol(:vpiece_, w) for w ∈ 0:(W-1)] for w ∈ 1:W push!( q.args, @@ -345,7 +343,7 @@ function transpose_vecunroll_quote_W_smaller(N, W) Wratio = Wratio_init while Wratio > 1 Wratioh = Wratio >>> 1 - for w ∈ 0:(Wratioh)-1 + for w ∈ 0:((Wratioh)-1) i = (2N) * w j = i + N for n ∈ 1:N @@ -364,10 +362,10 @@ function transpose_vecunroll_quote_W_smaller(N, W) Nhalf = N >>> 1 vecstride = 1 partition_stride = 2 - for nsplits = 0:log2N-1 + for nsplits = 0:(log2N-1) shuffle0 = transposeshuffle(nsplits, W, false) shuffle1 = transposeshuffle(nsplits, W, true) - for partition ∈ 0:(N>>>(nsplits+1))-1 + for partition ∈ 0:((N>>>(nsplits+1))-1) for _n1 ∈ 1:vecstride n1 = partition * partition_stride + _n1 n2 = n1 + vecstride @@ -445,7 +443,7 @@ end end @generated function vec_to_vecunroll(v::AbstractSIMDVector{W}) where {W} t = Expr(:tuple) - for w ∈ 0:W-1 + for w ∈ 0:(W-1) push!(t.args, :(extractelement(v, $w))) end Expr(:block, Expr(:meta, :inline), :(VecUnroll($t))) @@ -498,14 +496,14 @@ end @generated function uppervector(vx::AbstractSIMD{W}) where {W} s = Expr(:tuple) - for i ∈ W>>>1:W-1 + for i ∈ (W>>>1):(W-1) push!(s.args, i) end shuffleexpr(s) end @generated function lowervector(vx::AbstractSIMD{W}) where {W} s = Expr(:tuple) - for i ∈ 0:(W>>>1)-1 + for i ∈ 0:((W>>>1)-1) push!(s.args, i) end shuffleexpr(s) @@ -514,14 +512,14 @@ end @generated function extractupper(vx::AbstractSIMD{W}) where {W} s = Expr(:tuple) - for i ∈ 0:(W>>>1)-1 + for i ∈ 0:((W>>>1)-1) push!(s.args, 2i) end shuffleexpr(s) end @generated function extractlower(vx::AbstractSIMD{W}) where {W} s = Expr(:tuple) - for i ∈ 0:(W>>>1)-1 + for i ∈ 0:((W>>>1)-1) push!(s.args, 2i + 1) end shuffleexpr(s) diff --git a/src/llvm_types.jl b/src/llvm_types.jl index 60409cda..9b85e41d 100644 --- a/src/llvm_types.jl +++ b/src/llvm_types.jl @@ -117,14 
+117,15 @@ end """ use opaque pointer Ref: -- Switch LLVM codegen of Ptr{T} to an actual pointer type. - https://github.com/JuliaLang/julia/pull/53687 + + - Switch LLVM codegen of Ptr{T} to an actual pointer type. + https://github.com/JuliaLang/julia/pull/53687 """ const USE_OPAQUE_PTR = VERSION >= v"1.12-DEV" @static if !USE_OPAQUE_PTR const JULIAPOINTERTYPE = 'i' * string(8sizeof(Int)) -else +else const JULIAPOINTERTYPE = "ptr" end @@ -160,10 +161,10 @@ suffix(@nospecialize(T))::String = suffix(JULIA_TYPES[T]) @static if !USE_OPAQUE_PTR ptr_suffix(T) = "p0" * suffix(T) suffix(::Type{Ptr{T}}) where {T} = "p0" * suffix(T) -else +else ptr_suffix(T) = "p0" suffix(::Type{Ptr{T}}) where {T} = "p0" -end +end suffix(W::Int, T) = suffix(W, suffix(T)) # Type-dependent LLVM constants @@ -298,12 +299,13 @@ end Expr(:purity, true, true, true, true, false) end VERSION >= v"1.9.0-DEV.1019" && push!(purity.args, true) - VERSION >= v"1.11" && push!(purity.args, - #= inaccessiblememonly =# true, - #= noub =# true, - #= noub_if_noinbounds =# false, - #= consistent_overlay =# false, - #= nortcall =# true, + VERSION >= v"1.11" && push!( + purity.args, + #= inaccessiblememonly =#true, + #= noub =#true, + #= noub_if_noinbounds =#false, + #= consistent_overlay =#false, + #= nortcall =#true ) Expr(:meta, purity, :inline) else diff --git a/src/promotion.jl b/src/promotion.jl index c5e2f5fe..1795f60f 100644 --- a/src/promotion.jl +++ b/src/promotion.jl @@ -227,7 +227,7 @@ maybethrow(::False) = nothing ::Type{V2} ) where {Nm1,Wsplit,T,V1,T2,W,V2<:AbstractSIMDVector{W,T2}} maybethrow( - ArrayInterface.ne( + ne( StaticInt{Nm1}() * StaticInt{Wsplit}() + StaticInt{Wsplit}(), StaticInt{W}() ) @@ -240,7 +240,7 @@ end ::Type{V2} ) where {Nm1,Wsplit,T,V1,W,V2<:AbstractMask{W}} maybethrow( - ArrayInterface.ne( + ne( StaticInt{Nm1}() * StaticInt{Wsplit}() + StaticInt{Wsplit}(), StaticInt{W}() ) diff --git a/src/ranges.jl b/src/ranges.jl index e286be75..d5551c63 100644 --- a/src/ranges.jl +++ 
b/src/ranges.jl @@ -7,7 +7,7 @@ t = Expr(:tuple) foreach( w -> push!(t.args, Expr(:call, :(Core.VecElement), T(F * w + O))), - 0:W-1 + 0:(W-1) ) Expr(:block, Expr(:meta, :inline), Expr(:call, :Vec, t)) end @@ -48,7 +48,7 @@ F - static multiplicative factor iexpr = bytes == sizeof(I) ? :i : Expr(:call, :%, :i, jtypesym) typ = "i$(bits)" vtyp = vtype(W, typ) - rangevec = join(("$typ $(F*w + O)" for w ∈ 0:W-1), ", ") + rangevec = join(("$typ $(F*w + O)" for w ∈ 0:(W-1)), ", ") instrs = """ %ie = insertelement $vtyp undef, $typ %0, i32 0 %v = shufflevector $vtyp %ie, $vtyp undef, <$W x i32> zeroinitializer @@ -81,7 +81,7 @@ end ) typ = LLVM_TYPES[T] vtyp = vtype(W, typ) - rangevec = join(("$typ $(F*w+O).0" for w ∈ 0:W-1), ", ") + rangevec = join(("$typ $(F*w+O).0" for w ∈ 0:(W-1)), ", ") instrs = """ %ie = insertelement $vtyp undef, $typ %0, i32 0 %v = shufflevector $vtyp %ie, $vtyp undef, <$W x i32> zeroinitializer @@ -207,18 +207,12 @@ end @inline vfdiv_fast(i::MM, j::T) where {T<:Real} = vfdiv_fast(float(i), j) @inline vfdiv_fast(j::T, i::MM) where {T<:Real} = vfdiv_fast(j, float(i)) -@inline vfdiv(x::AbstractSIMDVector{W}, y::VectorizationBase.MM{W}) where {W} = - x / float(y) -@inline vfdiv(y::VectorizationBase.MM{W}, x::AbstractSIMDVector{W}) where {W} = - float(y) / x -@inline vfdiv_fast( - x::AbstractSIMDVector{W}, - y::VectorizationBase.MM{W} -) where {W} = vfiv_fast(x, float(y)) -@inline vfdiv_fast( - y::VectorizationBase.MM{W}, - x::AbstractSIMDVector{W} -) where {W} = vfdiv_fast(float(y), x) +@inline vfdiv(x::AbstractSIMDVector{W}, y::MM{W}) where {W} = x / float(y) +@inline vfdiv(y::MM{W}, x::AbstractSIMDVector{W}) where {W} = float(y) / x +@inline vfdiv_fast(x::AbstractSIMDVector{W}, y::MM{W}) where {W} = + vfdiv_fast(x, float(y)) +@inline vfdiv_fast(y::MM{W}, x::AbstractSIMDVector{W}) where {W} = + vfdiv_fast(float(y), x) @inline vfdiv(i::MM, j::VecUnroll{N,W,T,V}) where {N,W,T,V} = float(i) / j @inline vfdiv(j::VecUnroll{N,W,T,V}, i::MM) where {N,W,T,V}
= j / float(i) diff --git a/src/special/double.jl b/src/special/double.jl index 019e758c..0c37e094 100644 --- a/src/special/double.jl +++ b/src/special/double.jl @@ -27,7 +27,7 @@ # - [SLEEF](https://github.com/shibatch/SLEEF) [public domain] Author Naoki Shibata -using Base.Math: IEEEFloat +using Base: IEEEFloat for (op, f, ff) ∈ [ ("fadd", :add_ieee, :(+)), ("fsub", :sub_ieee, :(-)), @@ -39,17 +39,11 @@ for (op, f, ff) ∈ [ @generated $f( v1::Vec{W,T}, v2::Vec{W,T} - ) where {W,T<:Union{Float32,Float64}} = - VectorizationBase.binary_op($op, W, T) + ) where {W,T<:Union{Float32,Float64}} = binary_op($op, W, T) @inline $f(s1::T, s2::T) where {T<:Union{Float32,Float64}} = $ff(s1, s2) @inline $f(args::Vararg{Any,K}) where {K} = $f(promote(args...)...) - @inline $f(a::VecUnroll, b::VecUnroll) = VecUnroll( - VectorizationBase.fmap( - $f, - VectorizationBase.data(a), - VectorizationBase.data(b) - ) - ) + @inline $f(a::VecUnroll, b::VecUnroll) = + VecUnroll(fmap($f, data(a), data(b))) end end @inline add_ieee(a, b, c) = add_ieee(add_ieee(a, b), c) @@ -62,15 +56,15 @@ function sub_ieee!(ex) if _f isa Symbol f::Symbol = _f if f === :(+) - ex.args[1] = :(VectorizationBase.add_ieee) + ex.args[1] = :($(VectorizationBase).add_ieee) elseif f === :(-) - ex.args[1] = :(VectorizationBase.sub_ieee) + ex.args[1] = :($(VectorizationBase).sub_ieee) elseif f === :(*) - ex.args[1] = :(VectorizationBase.mul_ieee) + ex.args[1] = :($(VectorizationBase).mul_ieee) elseif f === :(/) - ex.args[1] = :(VectorizationBase.fdiv_ieee) + ex.args[1] = :($(VectorizationBase).fdiv_ieee) elseif f === :(%) - ex.args[1] = :(VectorizationBase.rem_ieee) + ex.args[1] = :($(VectorizationBase).rem_ieee) end end end @@ -81,11 +75,8 @@ macro ieee(ex) sub_ieee!(ex) end -const vIEEEFloat = Union{ - IEEEFloat, - Vec{<:Any,<:IEEEFloat}, - VectorizationBase.VecUnroll{<:Any,<:Any,<:IEEEFloat} -} +const vIEEEFloat = + Union{IEEEFloat,Vec{<:Any,<:IEEEFloat},VecUnroll{<:Any,<:Any,<:IEEEFloat}} struct Double{T<:vIEEEFloat} 
<: Number hi::T @@ -143,7 +134,7 @@ Base.issubnormal(d::Double) = issubnormal(d.hi) | issubnormal(d.lo) th = Expr(:tuple) tl = Expr(:tuple) gf = GlobalRef(Core, :getfield) - for n ∈ 1:N+1 + for n ∈ 1:(N+1) ifelseₕ = Expr(:call, :ifelse, Expr(:call, gf, :md, n, false)) ifelseₗ = Expr(:call, :ifelse, Expr(:call, gf, :md, n, false)) if V1 <: VecUnroll diff --git a/src/special/exp.jl b/src/special/exp.jl index a337a7bd..7cc743ee 100644 --- a/src/special/exp.jl +++ b/src/special/exp.jl @@ -405,10 +405,9 @@ end const TABLE_EXP_64_1 = Vec(ntuple(j -> Core.VecElement(Float64(2.0^(big(j + 7) / 16))), Val(8))) - @inline target_trunc(v, ::VectorizationBase.True) = v - @inline target_trunc(v, ::VectorizationBase.False) = v % UInt32 - @inline target_trunc(v) = - target_trunc(v, VectorizationBase.has_feature(Val(:x86_64_avx512dq))) + @inline target_trunc(v, ::True) = v + @inline target_trunc(v, ::False) = v % UInt32 + @inline target_trunc(v) = target_trunc(v, has_feature(Val(:x86_64_avx512dq))) # @inline function vexp2_v1(x::AbstractSIMD{8,Float64}) # x16 = x @@ -618,10 +617,7 @@ end r = fma(N_float, LogBo256L(Val{B}(), Float64), r) # @show (N & 0x000000ff) % Int # @show N N & 0x000000ff - js = vload( - VectorizationBase.zero_offsets(stridedpointer(J_TABLE)), - (N & 0x000000ff,) - ) + js = vload(zero_offsets(stridedpointer(J_TABLE)), (N & 0x000000ff,)) # k = N >>> 0x00000008 # small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js)) small_part = vfmadd(js, expm1b_kernel(Val{B}(), r), js) @@ -784,10 +780,7 @@ end r = fast_fma(N_float, LogBo256U(Val{B}(), Float64), x, fma_fast()) r = fast_fma(N_float, LogBo256L(Val{B}(), Float64), r, fma_fast()) # @show (N & 0x000000ff) % Int - js = vload( - VectorizationBase.zero_offsets(stridedpointer(J_TABLE)), - (N & 0x000000ff,) - ) + js = vload(zero_offsets(stridedpointer(J_TABLE)), (N & 0x000000ff,)) k = N >>> 0x00000008 small_part = reinterpret(UInt64, vfmadd(js, expm1b_kernel(Val{B}(), r), js)) # return 
reinterpret(Float64, small_part), r, k, N_float, js diff --git a/src/special/misc.jl b/src/special/misc.jl index ed1665c7..ed30782a 100644 --- a/src/special/misc.jl +++ b/src/special/misc.jl @@ -220,7 +220,7 @@ end vload(stridedpointer(A), (i, j...)) end -@inline Base.Sort.midpoint( +@inline Base.midpoint( lo::AbstractSIMDVector{W,I}, hi::AbstractSIMDVector{W,I} ) where {W,I<:Integer} = lo + ((hi - lo) >>> 0x01) @@ -238,7 +238,7 @@ for TType in [:Integer, :(AbstractSIMDVector{W,<:Integer})] hi = hi + u st = lo < hi - u @inbounds while vany(st) - m = Base.Sort.midpoint(lo, hi) + m = Base.midpoint(lo, hi) b = Base.Order.lt(o, x, v[m]) & st hi = ifelse(b, m, hi) lo = ifelse(b, lo, m) diff --git a/src/static.jl b/src/static.jl index 7cc0f4f0..32499495 100644 --- a/src/static.jl +++ b/src/static.jl @@ -7,7 +7,7 @@ last(a) - first(a) + oneunit(T) @inline maybestaticrange(r::Base.OneTo{T}) where {T} = - ArrayInterface.OptionallyStaticUnitRange(StaticInt{1}(), last(r)) + Static.OptionallyStaticUnitRange(StaticInt{1}(), last(r)) @inline maybestaticrange(r::UnitRange) = r @inline maybestaticrange(r) = maybestaticfirst(r):maybestaticlast(r) @@ -21,7 +21,7 @@ ::Val{1} ) where {T,V<:AbstractVector{T}} = One() @inline maybestaticsize(A, ::Val{N}) where {N} = - ArrayInterface.static_size(A)[N] + StaticArrayInterface.static_size(A)[N] # These have versions that may allow for more optimizations, so we override base methods with a single `StaticInt` argument. 
for (f, ff) ∈ [ diff --git a/src/strided_pointers/cse_stridemultiples.jl b/src/strided_pointers/cse_stridemultiples.jl index 0a2f83ae..ab8cc8cb 100644 --- a/src/strided_pointers/cse_stridemultiples.jl +++ b/src/strided_pointers/cse_stridemultiples.jl @@ -22,10 +22,11 @@ end @inline offsetprecalc(x::StridedBitPointer, ::Val) = x # @inline pointerforcomparison(p::AbstractStridedPointer) = pointer(p) # @inline pointerforcomparison(p::AbstractStridedPointer, i) = gep(p, i) -@inline ArrayInterface.offsets(p::OffsetPrecalc) = offsets(getfield(p, :ptr)) +@inline StaticArrayInterface.offsets(p::OffsetPrecalc) = + offsets(getfield(p, :ptr)) @inline Base.strides(p::OffsetPrecalc) = static_strides(getfield(p, :ptr)) -@inline ArrayInterface.static_strides(p::OffsetPrecalc) = +@inline StaticArrayInterface.static_strides(p::OffsetPrecalc) = static_strides(getfield(p, :ptr)) @inline function LayoutPointers.similar_no_offset(sptr::OffsetPrecalc, ptr::Ptr) diff --git a/src/strided_pointers/stridedpointers.jl b/src/strided_pointers/stridedpointers.jl index 32cd1e12..ddf2926e 100644 --- a/src/strided_pointers/stridedpointers.jl +++ b/src/strided_pointers/stridedpointers.jl @@ -8,8 +8,6 @@ register_size() ) -using LayoutPointers: nopromote_axis_indicator - @inline _vload( ptr::AbstractStridedPointer{T,0}, i::Tuple{}, @@ -112,7 +110,7 @@ end ::StaticInt{RS} ) where {T,Nm1,I<:VecUnroll{Nm1},A<:StaticBool,RS} t = Expr(:tuple) - for n = 1:Nm1+1 + for n = 1:(Nm1+1) push!( t.args, :(_vload( @@ -572,7 +570,7 @@ function llvmptr_comp_quote(cmp, Tsym) else instrs = "%cmpi1 = icmp $cmp i8* %0, %1\n%cmpi8 = zext i1 %cmpi1 to i8\nret i8 %cmpi8" end - Expr( + Expr( :block, Expr(:meta, :inline), :($(Base.llvmcall)($instrs, Bool, Tuple{$pt,$pt}, p1, p2)) diff --git a/src/vecunroll/fmap.jl b/src/vecunroll/fmap.jl index c6ab2738..fc4f94e5 100644 --- a/src/vecunroll/fmap.jl +++ b/src/vecunroll/fmap.jl @@ -362,14 +362,14 @@ function collapse_expr(N, op, final) 2final end while N > _final - for n ∈ 
1:N>>>1 + for n ∈ 1:(N>>>1) push!(q.args, Expr(:(=), s[n], Expr(:call, op, s[n], s[n+(N>>>1)]))) end isodd(N) && push!(q.args, Expr(:(=), s[1], Expr(:call, op, s[1], s[N]))) N >>>= 1 end if final != 1 - for n ∈ final+1:N + for n ∈ (final+1):N push!(q.args, Expr(:(=), s[n-final], Expr(:call, op, s[n-final], s[n]))) end t = Expr(:tuple) diff --git a/src/vecunroll/memory.jl b/src/vecunroll/memory.jl index 630f6537..487c7819 100644 --- a/src/vecunroll/memory.jl +++ b/src/vecunroll/memory.jl @@ -23,7 +23,7 @@ function unrolled_indicies( end inds = Vector{Expr}(undef, N) inds[1] = baseind - for n = 1:N-1 + for n = 1:(N-1) ind = copy(baseind) i = Expr(:call, Expr(:curly, :StaticInt, n * F)) if AU == AV && W > 1 @@ -180,7 +180,7 @@ function _shuffle_load_quote( return nothing if X > 0 mask_expr = :(mask(StaticInt{$W}(), 0, vmul_nw($UN, getfield(sm, :evl)))) - for n ∈ 1:UN-1 + for n ∈ 1:(UN-1) mask_expr = :(vcat( $mask_expr, mask(StaticInt{$W}(), $(n * W), vmul_nw($UN, getfield(sm, :evl))) @@ -190,14 +190,14 @@ function _shuffle_load_quote( else # FIXME return nothing - vrange = :(VectorizationBase.vrange( + vrange = :(vrange( Val{$W}(), $(integer_of_bytes(min(size_T, rs ÷ W))), Val{0}(), Val{-1}() )) mask_expr = :(($vrange + $(UN * W)) ≤ vmul_nw($UN, getfield(sm, :evl))) - for n ∈ UN-1:-1:1 + for n ∈ (UN-1):-1:1 mask_expr = :(vcat( $mask_expr, ($vrange + $(n * W)) ≤ vmul_nw($UN, getfield(sm, :evl)) @@ -208,8 +208,8 @@ function _shuffle_load_quote( end push!(q.args, :(v = $vloadexpr)) vut = Expr(:tuple) - Wrange = X > 0 ? (0:1:W-1) : (W-1:-1:0) - for n ∈ 0:UN-1 + Wrange = X > 0 ? 
(0:1:(W-1)) : ((W-1):-1:0) + for n ∈ 0:(UN-1) shufftup = Expr(:tuple) for w ∈ Wrange push!(shufftup.args, n + UN * w) @@ -256,8 +256,7 @@ function push_transpose_mask!( mm_evl_cmp = Symbol(:mm_evl_cmp_, n) if w == 1 isym = integer_of_bytes_symbol(min(4, RS ÷ n)) - vmmtyp = - :(VectorizationBase._vrange(Val{$n}(), $isym, Val{0}(), Val{1}())) + vmmtyp = :(_vrange(Val{$n}(), $isym, Val{0}(), Val{1}())) push!(q.args, :($mm_evl_cmp = $vmmtyp)) push!(q.args, :($mw_w = vmul_nw(_evl, $(UInt32(n))) > $mm_evl_cmp)) else @@ -1001,8 +1000,8 @@ function _shuffle_store_quote( Wtemp = Wnext end shufftup = Expr(:tuple) - for w ∈ ((X > 0) ? (0:1:W-1) : (W-1:-1:0)) - for n ∈ 0:UN-1 + for w ∈ ((X > 0) ? (0:1:(W-1)) : ((W-1):-1:0)) + for n ∈ 0:(UN-1) push!(shufftup.args, W * n + w) end end @@ -1117,7 +1116,7 @@ function vstore_transpose_quote( for nn ∈ 1:npartial push!(t.args, vds[i+nn]) end - for nn ∈ npartial+1:n + for nn ∈ (npartial+1):n # if W == 1 # push!(t.args, :(zero($Tsym))) # else @@ -2252,7 +2251,7 @@ function vload_double_unroll_quote( unroll = :(Unroll{$AUO,$FO,$NO,$AV,$W,$MO,$X}(Zero())) # tupvec = Vector{Expr}(undef, NI) vds = Vector{Symbol}(undef, NI) - for ui ∈ 0:NI-1 + for ui ∈ 0:(NI-1) if ui == 0 loadq = :(_vload_unroll(gptr, $unroll)) # VecUnroll($tup) else @@ -2286,7 +2285,7 @@ function vload_double_unroll_quote( else # we loop over `UO+1` and do the loads unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero())) tup = Expr(:tuple) - for uo ∈ 0:NO-1 + for uo ∈ 0:(NO-1) if uo == 0 loadq = :(_vload_unroll(gptr, $unroll)) else @@ -2473,7 +2472,7 @@ function vstore_double_unroll_quote( push!(q.args, :($vdt = getfield(getfield(vd, $t, false), 1))) end # tupvec = Vector{Expr}(undef, NI) - for ui ∈ 0:NI-1 + for ui ∈ 0:(NI-1) tup = Expr(:tuple) # tup = ui == 0 ? 
Expr(:tuple) : tupvec[ui+1] for t ∈ 1:NO @@ -2501,7 +2500,7 @@ function vstore_double_unroll_quote( end else # we loop over `UO+1` and do the stores unroll = :(Unroll{$AUI,$FI,$NI,$AV,$W,$MI,$X}(Zero())) - for uo ∈ 0:NO-1 + for uo ∈ 0:(NO-1) if uo == 0 storeq = :(_vstore_unroll!(gptr, getfield(vd, 1, false), $unroll)) else @@ -2993,10 +2992,10 @@ function transposeshuffle(split, W, offset::Bool) S = 1 << split i = offset ? S : 0 while w < W - for s ∈ 0:S-1 + for s ∈ 0:(S-1) push!(tup.args, w + s + i) end - for s ∈ 0:S-1 + for s ∈ 0:(S-1) # push!(tup.args, w + W + s) push!(tup.args, w + W + s + i) end @@ -3030,7 +3029,7 @@ function horizontal_reduce_store_expr( push!(q.args, :(gptr = gesp(ptr, $gf(u, :i)))) push!(q.args, :(bptr = pointer(gptr))) extractblock = Expr(:block) - vectors = [Symbol(:v_, n) for n ∈ 0:N-1] + vectors = [Symbol(:v_, n) for n ∈ 0:(N-1)] for n ∈ 1:N push!( extractblock.args, @@ -3090,7 +3089,7 @@ function horizontal_reduce_store_expr( v0, Expr( :call, - Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:Wh-1]...)) + Expr(:curly, :Val, Expr(:tuple, [w for w ∈ 0:(Wh-1)]...)) ) ), Expr( @@ -3099,7 +3098,7 @@ function horizontal_reduce_store_expr( v0, Expr( :call, - Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:Wt-1]...)) + Expr(:curly, :Val, Expr(:tuple, [w for w ∈ Wh:(Wt-1)]...)) ) ) ) @@ -3120,7 +3119,7 @@ function horizontal_reduce_store_expr( end if mask boolmask = Expr(:call, :Vec) - for n ∈ ncomp+1:ncomp+minWN + for n ∈ (ncomp+1):(ncomp+minWN) push!(boolmask.args, Expr(:call, gf, :masktuple, n, false)) end push!(storeexpr.args, Expr(:call, :tomask, boolmask)) @@ -3138,7 +3137,7 @@ function horizontal_reduce_store_expr( zeroexpr = Expr(:call, Expr(:curly, :StaticInt, 0)) ind = Expr(:tuple) foreach(_ -> push!(ind.args, zeroexpr), 1:D) - for n ∈ N+1:Ntotal + for n ∈ (N+1):Ntotal (n > N + 1) && (ind = copy(ind)) # copy to avoid overwriting old ind.args[AU] = Expr(:call, Expr(:curly, :StaticInt, F * (n - 1))) scalar = Expr(:call, reduct, Expr(:call, 
gf, :v, n, false)) @@ -3346,7 +3345,7 @@ function lazymulunroll_load_quote(M, O, N, maskall, masklast, align, rs) alignval = Expr(:call, align ? :True : :False) rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs)) gf = GlobalRef(Core, :getfield) - for n = 1:N+1 + for n = 1:(N+1) ind = if (M != 1) | (O != 0) :(LazyMulAdd{$M,$O}(u[$n])) else @@ -3489,7 +3488,7 @@ function lazymulunroll_store_quote( noaliasval = Expr(:call, noalias ? :True : :False) nontemporalval = Expr(:call, nontemporal ? :True : :False) rsexpr = Expr(:call, Expr(:curly, :StaticInt, rs)) - for n = 1:N+1 + for n = 1:(N+1) push!( q.args, Expr( @@ -3520,7 +3519,7 @@ end v = Base.FastMath.add_fast(s + mm) end t = Expr(:tuple, :v) - for n ∈ 1:N-1 + for n ∈ 1:(N-1) # push!(t.args, :(MM{$W,$W}(Base.FastMath.add_fast(s, $(T(n*W)))))) push!( t.args, @@ -3548,7 +3547,7 @@ end else Expr(:tuple, :v) end - for n ∈ 1:N-1 + for n ∈ 1:(N-1) M >>>= 1 if M % Bool push!( @@ -3583,7 +3582,7 @@ end z = zero(v) end t = Expr(:tuple, :(ifelse(getfield(m, $1, false), v, z))) - for n ∈ 1:N-1 + for n ∈ 1:(N-1) push!( t.args, :(ifelse( diff --git a/test/Project.toml b/test/Project.toml index 30d4cd7c..b12914bd 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,9 +1,15 @@ [deps] Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595" ArrayInterface = "4fba245c-0d91-5ea0-9b3e-6abc04ee57a9" +ExplicitImports = "7d51a73a-1435-4ff3-83d9-f097790105c7" +HostCPUFeatures = "3e5b6fbb-0976-4d2c-9146-d79de83f2fb0" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" OffsetArrays = "6fe1bfb0-de20-5000-8ca7-80f57d26f881" LayoutPointers = "10f19ff3-798f-405d-979b-55457f8fc047" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" -Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" \ No newline at end of file +Static = "aedffcd0-7271-4cad-89d0-dc628f76c6d3" +StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718" + +[compat] +ExplicitImports = "1.13.2" diff --git a/test/accuracy.jl 
b/test/accuracy.jl index 7c76aecf..20a8f849 100644 --- a/test/accuracy.jl +++ b/test/accuracy.jl @@ -115,8 +115,8 @@ function test_acc( reference = map(f2 ∘ big, xx) comp = similar(xx) i = 0 - spc = VectorizationBase.zstridedpointer(comp) - spx = VectorizationBase.zstridedpointer(xx) + spc = LayoutPointers.zstridedpointer(comp) + spx = LayoutPointers.zstridedpointer(xx) GC.@preserve xx comp begin while i < length(xx) vstore!(spc, f1(vload(spx, (MM{W}(i),))), (MM{W}(i),)) diff --git a/test/runtests.jl b/test/runtests.jl index c9cde993..baf81f78 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,4 +1,5 @@ -import InteractiveUtils, Aqua, ArrayInterface +import InteractiveUtils, Aqua, ArrayInterface, ExplicitImports +import HostCPUFeatures, LayoutPointers, Static, StaticArrayInterface InteractiveUtils.versioninfo(stdout; verbose = true) include("testsetup.jl") @@ -17,18 +18,57 @@ include("testsetup.jl") # 3. Use package extensions (still buggy in current Julia LTS v1.10.10) pirated_types = [ - VectorizationBase.FastRange, - VectorizationBase.AbstractStridedPointer, - VectorizationBase.StridedBitPointer, - VectorizationBase.StaticInt, - VectorizationBase.AbstractSIMD, - VectorizationBase.Bit, - ] - Aqua.test_all(VectorizationBase; deps_compat = deps_compat, piracies=(treat_as_own = pirated_types,)) + VectorizationBase.FastRange, + VectorizationBase.AbstractStridedPointer, + VectorizationBase.StridedBitPointer, + VectorizationBase.StaticInt, + VectorizationBase.AbstractSIMD, + VectorizationBase.Bit + ] + Aqua.test_all( + VectorizationBase; + deps_compat = deps_compat, + piracies = (treat_as_own = pirated_types,) + ) println("Aqua took $((time_ns() - t0)*1e-9) seconds") # @test isempty(detect_unbound_args(VectorizationBase)) # @test isempty(detect_ambiguities(VectorizationBase)) + @testset "ExplicitImports" begin + # No implicit imports (`using XY`) + @test ExplicitImports.check_no_implicit_imports(VectorizationBase) === + nothing + + # All explicit imports 
(`using XY: Z`) are loaded via their owners + @test ExplicitImports.check_all_explicit_imports_via_owners( + VectorizationBase + ) === nothing + + # No explicit imports (`using XY: Z`) of non-public names + @test_broken ExplicitImports.check_all_explicit_imports_are_public( + VectorizationBase + ) === nothing + + # No explicit imports (`using XY: Z`) that are not used + @test ExplicitImports.check_no_stale_explicit_imports(VectorizationBase) === + nothing + + # Nothing is accessed via modules other than its owner + @test ExplicitImports.check_all_qualified_accesses_via_owners( + VectorizationBase + ) === nothing + + # No accesses of non-public names + @test_broken ExplicitImports.check_all_qualified_accesses_are_public( + VectorizationBase + ) === nothing + + # No self-qualified accesses + @test ExplicitImports.check_no_self_qualified_accesses( + VectorizationBase + ) === nothing + end + W = Int(@inferred(VectorizationBase.pick_vector_width(Float64))) Sys.WORD_SIZE == 64 && @test @inferred(VectorizationBase.pick_integer(Val(W))) == ( @@ -100,12 +140,14 @@ include("testsetup.jl") @test VectorizationBase.align(i) == VectorizationBase.register_size() end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.align(i) == 2VectorizationBase.register_size() end for i ∈ (1:VectorizationBase.register_size()) .+ 9VectorizationBase.register_size() + @test VectorizationBase.align(i) == 10VectorizationBase.register_size() end for i ∈ 1:VectorizationBase.register_size() @@ -113,13 +155,15 @@ include("testsetup.jl") reinterpret(Ptr{Cvoid}, Int(VectorizationBase.register_size())) end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.align(reinterpret(Ptr{Cvoid}, i)) == reinterpret(Ptr{Cvoid}, 
2Int(VectorizationBase.register_size())) end for i ∈ (1:VectorizationBase.register_size()) .+ 19VectorizationBase.register_size() + @test VectorizationBase.align(reinterpret(Ptr{Cvoid}, i)) == reinterpret(Ptr{Cvoid}, 20Int(VectorizationBase.register_size())) end @@ -130,7 +174,8 @@ include("testsetup.jl") W32 * cld(i, W32) end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.align(i, W32) == VectorizationBase.align(i, Float32) == VectorizationBase.align(i, Int32) == @@ -139,6 +184,7 @@ include("testsetup.jl") for i ∈ (1:VectorizationBase.register_size()) .+ 29VectorizationBase.register_size() + @test VectorizationBase.align(i, W32) == VectorizationBase.align(i, Float32) == VectorizationBase.align(i, Int32) == @@ -152,7 +198,8 @@ include("testsetup.jl") W64 * cld(i, W64) end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.align(i, W64) == VectorizationBase.align(i, Float64) == VectorizationBase.align(i, Int64) == @@ -161,6 +208,7 @@ include("testsetup.jl") for i ∈ (1:VectorizationBase.register_size()) .+ 29VectorizationBase.register_size() + @test VectorizationBase.align(i, W64) == VectorizationBase.align(i, Float64) == VectorizationBase.align(i, Int64) == @@ -170,16 +218,18 @@ include("testsetup.jl") @test reinterpret(Int, VectorizationBase.align(pointer(A))) % VectorizationBase.register_size() === 0 - for i ∈ 0:VectorizationBase.register_size()-1 + for i ∈ 0:(VectorizationBase.register_size()-1) @test VectorizationBase.aligntrunc(i) == 0 end for i ∈ - VectorizationBase.register_size():2VectorizationBase.register_size()-1 + VectorizationBase.register_size():(2VectorizationBase.register_size()-1) + @test VectorizationBase.aligntrunc(i) == VectorizationBase.register_size() end for i ∈ - 
(0:VectorizationBase.register_size()-1) .+ + (0:(VectorizationBase.register_size()-1)) .+ 9VectorizationBase.register_size() + @test VectorizationBase.aligntrunc(i) == 9VectorizationBase.register_size() end @@ -191,7 +241,8 @@ include("testsetup.jl") W32 * div(i, W32) end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.aligntrunc(i, W32) == VectorizationBase.aligntrunc(i, Float32) == VectorizationBase.aligntrunc(i, Int32) == @@ -200,6 +251,7 @@ include("testsetup.jl") for i ∈ (1:VectorizationBase.register_size()) .+ 29VectorizationBase.register_size() + @test VectorizationBase.aligntrunc(i, W32) == VectorizationBase.aligntrunc(i, Float32) == VectorizationBase.aligntrunc(i, Int32) == @@ -213,7 +265,8 @@ include("testsetup.jl") W64 * div(i, W64) end for i ∈ - 1+VectorizationBase.register_size():2VectorizationBase.register_size() + (1+VectorizationBase.register_size()):2VectorizationBase.register_size() + @test VectorizationBase.aligntrunc(i, W64) == VectorizationBase.aligntrunc(i, Float64) == VectorizationBase.aligntrunc(i, Int64) == @@ -222,6 +275,7 @@ include("testsetup.jl") for i ∈ (1:VectorizationBase.register_size()) .+ 29VectorizationBase.register_size() + @test VectorizationBase.aligntrunc(i, W64) == VectorizationBase.aligntrunc(i, Float64) == VectorizationBase.aligntrunc(i, Int64) == @@ -523,13 +577,13 @@ include("testsetup.jl") @test all(VectorizationBase._ispow2, 0:1) @test all( i -> - !any(VectorizationBase._ispow2, 1+(1<<(i-1)):(1< VectorizationBase.intlog2(1 << i) == i, - 0:(Int == Int64 ? 53 : 30) + 0:(Int==Int64 ? 
53 : 30) ) FTypes = (Float32, Float64) Wv = ntuple( @@ -546,7 +600,7 @@ include("testsetup.jl") while true W >>= VectorizationBase.One() W == 0 && break - W2, Wshift2 = @inferred(VectorizationBase.pick_vector_width_shift(W, T)) + W2, Wshift2 = @inferred(HostCPUFeatures.pick_vector_width_shift(W, T)) @test W2 == VectorizationBase.One() << Wshift2 == @inferred(VectorizationBase.pick_vector_width(W, T)) == @@ -555,9 +609,8 @@ include("testsetup.jl") @test StaticInt(W) === VectorizationBase.pick_vector_width(Val(Int(W)), T) === VectorizationBase.pick_vector_width(W, T) - for n = W+1:2W - W3, Wshift3 = - VectorizationBase.pick_vector_width_shift(StaticInt(n), T) + for n = (W+1):2W + W3, Wshift3 = HostCPUFeatures.pick_vector_width_shift(StaticInt(n), T) @test W2 << 1 == W3 == 1 << (Wshift2 + 1) == @@ -602,7 +655,7 @@ include("testsetup.jl") dims = (41, 42, 43) .* 3 # dims = (41,42,43); - A = reshape(collect(Float64(0):Float64(prod(dims) - 1)), dims) + A = reshape(collect(Float64(0):Float64(prod(dims)-1)), dims) P = PermutedDimsArray(A, (3, 1, 2)) O = OffsetArray(P, (-4, -2, -3)) @@ -719,9 +772,9 @@ include("testsetup.jl") @test v2 === VectorizationBase.data(vu)[2] @test v3 === VectorizationBase.data(vu)[3] - ir = 0:(AV == 1 ? W64 - 1 : 0) - jr = 0:(AV == 2 ? W64 - 1 : 0) - kr = 0:(AV == 3 ? W64 - 1 : 0) + ir = 0:(AV==1 ? W64-1 : 0) + jr = 0:(AV==2 ? W64-1 : 0) + kr = 0:(AV==3 ? W64-1 : 0) x1 = getindex.(Ref(B), i .+ ir, j .+ jr, k .+ kr) if AU == 1 ir = ir .+ length(ir) @@ -757,9 +810,9 @@ include("testsetup.jl") VectorizationBase.Unroll{AU,1,5,0,1,zero(UInt)}((i, j, k)) ) end - ir = 0:(AU == 1 ? 4 : 0) - jr = 0:(AU == 2 ? 4 : 0) - kr = 0:(AU == 3 ? 4 : 0) + ir = 0:(AU==1 ? 4 : 0) + jr = 0:(AU==2 ? 4 : 0) + kr = 0:(AU==3 ? 
4 : 0) xvs = getindex.(Ref(B), i .+ ir, j .+ jr, k .+ kr) @test xvs ≈ map(VectorizationBase.vsum, [v1, v2, v3, v4, v5]) end @@ -771,9 +824,9 @@ include("testsetup.jl") pointer(x), j, (i * VectorizationBase.static_sizeof(Int)), - VectorizationBase.False(), - VectorizationBase.False(), - VectorizationBase.False(), + Static.False(), + Static.False(), + Static.False(), VectorizationBase.register_size() ) i += 1 @@ -784,9 +837,9 @@ include("testsetup.jl") j, (VectorizationBase.static_sizeof(Int) * i), Mask{1}(0xff), - VectorizationBase.False(), - VectorizationBase.False(), - VectorizationBase.False(), + Static.False(), + Static.False(), + Static.False(), VectorizationBase.register_size() ) i += 1 @@ -796,9 +849,9 @@ include("testsetup.jl") pointer(x), j, VectorizationBase.lazymul(i, VectorizationBase.static_sizeof(Int)), - VectorizationBase.False(), - VectorizationBase.False(), - VectorizationBase.False(), + Static.False(), + Static.False(), + Static.False(), VectorizationBase.register_size() ) i += 1 @@ -809,9 +862,9 @@ include("testsetup.jl") j, VectorizationBase.lazymul(VectorizationBase.static_sizeof(Int), i), Mask{1}(0xff), - VectorizationBase.False(), - VectorizationBase.False(), - VectorizationBase.False(), + Static.False(), + Static.False(), + Static.False(), VectorizationBase.register_size() ) i += 1 @@ -893,21 +946,21 @@ include("testsetup.jl") SizedWrapper{M,N}(A::AT) where {M,N,T,AT<:AbstractMatrix{T}} = SizedWrapper{M,N,T,AT}(A) Base.size(::SizedWrapper{M,N}) where {M,N} = (M, N) - VectorizationBase.static_size(::SizedWrapper{M,N}) where {M,N} = + StaticArrayInterface.static_size(::SizedWrapper{M,N}) where {M,N} = (StaticInt(M), StaticInt(N)) Base.getindex(A::SizedWrapper, i...) = getindex(parent(A), i...) 
Base.parent(dw::SizedWrapper) = dw.A VectorizationBase.ArrayInterface.parent_type( ::Type{SizedWrapper{M,N,T,AT}} ) where {M,N,T,AT} = AT - VectorizationBase.memory_reference(dw::SizedWrapper) = - VectorizationBase.memory_reference(parent(dw)) - VectorizationBase.contiguous_axis(::Type{A}) where {A<:SizedWrapper} = - VectorizationBase.contiguous_axis( + LayoutPointers.memory_reference(dw::SizedWrapper) = + LayoutPointers.memory_reference(parent(dw)) + StaticArrayInterface.contiguous_axis(::Type{A}) where {A<:SizedWrapper} = + StaticArrayInterface.contiguous_axis( VectorizationBase.ArrayInterface.parent_type(A) ) - VectorizationBase.contiguous_batch_size(dw::SizedWrapper) = - VectorizationBase.contiguous_batch_size(parent(dw)) + StaticArrayInterface.contiguous_batch_size(dw::SizedWrapper) = + StaticArrayInterface.contiguous_batch_size(parent(dw)) VectorizationBase.stride_rank(::Type{A}) where {A<:SizedWrapper} = VectorizationBase.stride_rank( VectorizationBase.ArrayInterface.parent_type(A) @@ -936,11 +989,11 @@ include("testsetup.jl") At = ai ? A : (similar(A')') Bt = bi ? B : (similar(B')') Ct = ci ? 
C : (similar(C')') - spdw = VectorizationBase.DensePointerWrapper{(true, true)}( + spdw = LayoutPointers.DensePointerWrapper{(true, true)}( VectorizationBase.stridedpointer(At) ) gsp, pres = @inferred( - VectorizationBase.grouped_strided_pointer( + LayoutPointers.grouped_strided_pointer( (spdw, Bt, Ct), Val{(((1, 1), (3, 1)), ((1, 2), (2, 1)), ((2, 2), (3, 2)))}() ) @@ -952,13 +1005,13 @@ include("testsetup.jl") @test sizeof(gsp) == sizeof(Int) * (6 - (ai & ci) - ((!ai) & bi) - ((!bi) & (!ci))) @test sizeof(gsp.offsets) == 0 - pA, pB, pC = @inferred(VectorizationBase.stridedpointers(gsp)) + pA, pB, pC = @inferred(LayoutPointers.stridedpointers(gsp)) @test pA === stridedpointer(At) @test pB === stridedpointer(Bt) @test pC === stridedpointer(Ct) Btsw = SizedWrapper{K,N}(Bt) gsp2, pres2 = @inferred( - VectorizationBase.grouped_strided_pointer( + LayoutPointers.grouped_strided_pointer( (At, Btsw, Ct), Val{(((1, 1), (3, 1)), ((1, 2), (2, 1)), ((2, 2), (3, 2)))}() ) @@ -966,7 +1019,7 @@ include("testsetup.jl") @test sizeof(gsp2) == sizeof(Int) * (5 - (ai & ci) - ((!ai) & bi) - ((!bi) & (!ci))) - pA2, pB2, pC2 = @inferred(VectorizationBase.stridedpointers(gsp2)) + pA2, pB2, pC2 = @inferred(LayoutPointers.stridedpointers(gsp2)) @test pointer(pA2) == pointer(At) @test pointer(pB2) == pointer(Bt) @test pointer(pC2) == pointer(Ct) @@ -977,10 +1030,10 @@ include("testsetup.jl") end data_in_large = Array{Float64}(undef, 4, 4, 4, 4, 1) - data_in = view(data_in_large, :, 1, :, :, 1) + data_in = view(data_in_large,:,1,:,:,1) tmp1 = Array{Float64}(undef, 4, 4, 4) - sp_data_in, sp_tmp1 = VectorizationBase.stridedpointers( - VectorizationBase.grouped_strided_pointer( + sp_data_in, sp_tmp1 = LayoutPointers.stridedpointers( + LayoutPointers.grouped_strided_pointer( (data_in, tmp1), Val((((1, 1), (2, 1)),)) )[1] @@ -1263,8 +1316,10 @@ include("testsetup.jl") Vec(ntuple(_ -> Core.VecElement(rand(I1)), Val(WI))) )) srange = - 
one(I2):(Bool(VectorizationBase.has_feature(Val(:x86_64_avx512dq))) ? - I2(8sizeof(I1) - 1) : I2(31)) + one( + I2 + ):(Bool(VectorizationBase.has_feature(Val(:x86_64_avx512dq))) ? + I2(8sizeof(I1)-1) : I2(31)) vi2 = VectorizationBase.VecUnroll(( Vec(ntuple(_ -> Core.VecElement(rand(srange)), Val(WI))), Vec(ntuple(_ -> Core.VecElement(rand(srange)), Val(WI))), @@ -1453,10 +1508,10 @@ include("testsetup.jl") end vi2 = VectorizationBase.VecUnroll(( - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(WI))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(WI))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(WI))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(WI))) + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(WI))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(WI))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(WI))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(WI))) )) vones, vi2f, vtwos = promote(1.0, vi2, 2.0f0) # promotes a binary function, right? Even when used with three args? 
@test vones === VectorizationBase.VecUnroll(( @@ -1536,9 +1591,9 @@ include("testsetup.jl") @test tovector(clamp(m1, 2:i)) == clamp.(tovector(m1), 2, i) @test tovector(mod(m1, 1:i)) == mod1.(tovector(m1), i) - @test VectorizationBase.vdivrem.(1:30, 1:30') == divrem.(1:30, 1:30') - @test VectorizationBase.vcld.(1:30, 1:30') == cld.(1:30, 1:30') - @test VectorizationBase.vrem.(1:30, 1:30') == rem.(1:30, 1:30') + @test VectorizationBase.vdivrem.(1:30, 1:(30')) == divrem.(1:30, 1:(30')) + @test VectorizationBase.vcld.(1:30, 1:(30')) == cld.(1:30, 1:(30')) + @test VectorizationBase.vrem.(1:30, 1:(30')) == rem.(1:30, 1:(30')) @test gcd(Vec(42, 64, 0, -37), Vec(18, 96, -38, 0)) === Vec(6, 32, 38, 37) @test lcm(Vec(24, 16, 42, 0), Vec(18, 12, 18, 17)) === Vec(72, 48, 126, 0) @@ -2028,27 +2083,30 @@ include("testsetup.jl") 1 )) ) === StaticInt{8}() - @test VectorizationBase.CartesianVIndex((StaticInt(-4), StaticInt(7))):VectorizationBase.CartesianVIndex(( - StaticInt(14), - StaticInt(73) - )) === CartesianIndices(( + @test VectorizationBase.CartesianVIndex(( + StaticInt(-4), + StaticInt(7) + )):VectorizationBase.CartesianVIndex((StaticInt(14), StaticInt(73))) === + CartesianIndices(( StaticInt(-4):StaticInt(14), StaticInt(7):StaticInt(73) )) - @test VectorizationBase.maybestaticfirst(CartesianIndices(A)):VectorizationBase.maybestaticlast( + @test VectorizationBase.maybestaticfirst( CartesianIndices(A) - ) == CartesianIndices(A) - @test VectorizationBase.maybestaticfirst(CartesianIndices(A)):VectorizationBase.maybestaticlast( + ):VectorizationBase.maybestaticlast(CartesianIndices(A)) == + CartesianIndices(A) + @test VectorizationBase.maybestaticfirst( CartesianIndices(A) - ) === CartesianIndices(map(i -> VectorizationBase.One():i, size(A))) + ):VectorizationBase.maybestaticlast(CartesianIndices(A)) === + CartesianIndices(map(i -> VectorizationBase.One():i, size(A))) end println("Promotion") @time @testset "Promotion" begin vi2 = VectorizationBase.VecUnroll(( - Vec(ntuple(_ 
-> Core.VecElement(rand(1:M-1)), Val(W64))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(W64))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(W64))), - Vec(ntuple(_ -> Core.VecElement(rand(1:M-1)), Val(W64))) + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(W64))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(W64))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(W64))), + Vec(ntuple(_ -> Core.VecElement(rand(1:(M-1))), Val(W64))) )) vones, vi2f, vtwos = @inferred(promote(1.0, vi2, 2.0f0)) # promotes a binary function, right? Even when used with three args? @test vones === VectorizationBase.VecUnroll(( @@ -2299,7 +2357,7 @@ include("testsetup.jl") ) === typemax(Int32) v = Vec( ntuple( - _ -> rand(typemax(UInt)>>1+one(UInt):typemax(UInt)), + _ -> rand((typemax(UInt)>>1+one(UInt)):typemax(UInt)), VectorizationBase.pick_vector_width(UInt) )... ) diff --git a/test/testsetup.jl b/test/testsetup.jl index 5affe0cd..bb4568a7 100644 --- a/test/testsetup.jl +++ b/test/testsetup.jl @@ -15,14 +15,14 @@ function tovector(u::VectorizationBase.VecUnroll{_N,W,_T}) where {_N,W,_T} x = Vector{T}(undef, N * W) for n ∈ 1:N v = VectorizationBase.data(u)[n] - for w ∈ 0:W-1 + for w ∈ 0:(W-1) x[(i+=1)] = VectorizationBase.extractelement(v, w) end end x end tovector(v::VectorizationBase.AbstractSIMDVector{W}) where {W} = - [VectorizationBase.extractelement(v, w) for w ∈ 0:W-1] + [VectorizationBase.extractelement(v, w) for w ∈ 0:(W-1)] tovector(v::VectorizationBase.LazyMulAdd) = tovector(VectorizationBase._materialize(v)) tovector(x) = x