From c42c08cc413e628515417fa4cb35104eca49c5a2 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Tue, 18 Feb 2020 16:21:28 -0800 Subject: [PATCH 1/8] Add in-place multiply-add --- src/StaticArrays.jl | 1 + src/matrix_multiply.jl | 7 +- src/matrix_multiply_add.jl | 172 ++++++++++++++++++++++++++++++++++++ test/matrix_multiply_add.jl | 90 +++++++++++++++++++ test/runtests.jl | 1 + 5 files changed, 268 insertions(+), 3 deletions(-) create mode 100644 src/matrix_multiply_add.jl create mode 100644 test/matrix_multiply_add.jl diff --git a/src/StaticArrays.jl b/src/StaticArrays.jl index 851cc88e..e2027829 100644 --- a/src/StaticArrays.jl +++ b/src/StaticArrays.jl @@ -126,6 +126,7 @@ include("mapreduce.jl") include("arraymath.jl") include("linalg.jl") include("matrix_multiply.jl") +include("matrix_multiply_add.jl") include("det.jl") include("inv.jl") include("solve.jl") diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index 15996b05..c55978c5 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -285,7 +285,10 @@ end end -@generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, ::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}) where {s,sa,sb, T <: BlasFloat} +@generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, + ::Size{sa}, ::Size{sb}, + a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}, + alpha::Real=one(T), beta::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s")) end @@ -316,8 +319,6 @@ end end return quote - alpha = one(T) - beta = zero(T) transA = 'N' transB = 'N' m = $(sa[1]) diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl new file mode 100644 index 00000000..1cc8a217 --- /dev/null +++ b/src/matrix_multiply_add.jl @@ -0,0 +1,172 @@ + +# 5-argument matrix multiplication +@inline LinearAlgebra.mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticVecOrMat, α::Real, β::Real) = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) +@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}, α::Real, β::Real) = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) +@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}, α::Real, β::Real) = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) + +# Matrix-matrix multiplication +@generated function _mul!(Sc::Size{sc}, c::StaticMatrix{<:Any, <:Any, Tc}, + Sa::Size{sa}, Sb::Size{sb}, + a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticMatrix{<:Any, <:Any, Tb}, + α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} + can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat + + if can_blas + if sa[1] * sa[2] * sb[2] < 4*4*4 + return quote + @_inline_meta + muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + else + return quote + @_inline_meta + mul_blas!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + end + else + if sa[1] * sa[2] * sb[2] < 4*4*4 + return quote + @_inline_meta + mul_unrolled!(Sc, c, Sa, Sb, a, b) + return c + end + else + return quote + @_inline_meta + mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) + return c + end + end + end +end + +# Matrix-vector multiplication +@generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, + a::StaticMatrix, b::StaticVector, α::Real, β::Real) where {sa, sb, sc} + if sb[1] != sa[2] || sc[1] != sa[1] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + exprs = [:(c[$k] = β * c[$k] + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] + else + exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] + end + + return quote + @_inline_meta + @inbounds $(Expr(:block, exprs...)) + return c + end +end + +# Outer product +@generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector, + b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}, + α::Real, β::Real) where {sa, sb, sc} + if sa[1] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + exprs = [:(c[$(LinearIndices(sc)[i, j])] = β * c[$(LinearIndices(sc)[i, j])] + α * + a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] + + return quote + @_inline_meta + @inbounds $(Expr(:block, exprs...)) + return c + end +end + +@generated function muladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, + a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} + if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * + $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]] + ))) for k1 = 1:sa[1], k2 = 1:sb[2]] + else + exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] + end + + return quote + @_inline_meta + @inbounds $(Expr(:block, exprs...)) + end +end + +@generated function muladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, + ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix, + α::Real, β::Real) where {sa, sb, sc} + if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] + + # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster + tmp_type = SVector{sb[1], eltype(c)} + vect_exprs = [:($(Symbol("tmp_$k2")) = + partly_unrolled_multiply($(Size(sa)), $(Size(sb[1])), a, + $(Expr(:call, tmp_type, + [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] + + exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * + $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + + return quote + @_inline_meta + @inbounds $(Expr(:block, vect_exprs...)) + @inbounds $(Expr(:block, exprs...)) + end +end + +# Special-case SizedMatrix +@inline mul!(dest::SizedMatrix{<:Any, <:Any, Tc}, A::SizedMatrix{<:Any, <:Any, Ta}, + B::SizedMatrix{<:Any, <:Any, Tb}) where {Ta,Tb,Tc} = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, one(Ta), zero(Ta)) + +@generated function _mul!(Sc::Size{sc}, c::SizedMatrix{<:Any, <:Any, Tc}, + Sa::Size{sa}, Sb::Size{sb}, + a::SizedMatrix{<:Any, <:Any, Ta}, b::SizedMatrix{<:Any, <:Any, Tb}, + α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} + can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat + + if can_blas + if sa[1] * sa[2] * sb[2] < 4*4*4 + return quote + @_inline_meta + muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + else + return quote + # @_inline_meta + BLAS.gemm!('N','N', α, a.data, b.data, β, c.data) + return c + end + end + end +end diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl new file mode 100644 index 00000000..c4c11d7d --- /dev/null +++ b/test/matrix_multiply_add.jl @@ -0,0 +1,90 @@ + +function test_muladd(a,b,c, α, β) + c0 = copy(c) + mul!(c,a,b,α,β) + @test c ≈ a*b*α + c0*β + c .= c0 + mul!(c,a,b) + @test c ≈ a*b + @test (@allocated mul!(c,a,b)) == 0 + @test (@allocated mul!(c,a,b,α,β)) == 0 +end + +@testset "Matrix multiply-add" begin + α,β = 2.0, 1.0 + + # Test matrix multiplication + N = 3 + a = @MMatrix rand(N,N) + b = @MMatrix rand(N,N) + c = @MMatrix rand(N,N) + test_muladd(a, b, c, α, β) + test_muladd(a, b, c, 1, β) + test_muladd(a, b, c, α, 2) + test_muladd(a, b, c, α, Float32(3)) + + N = 5 + a = @MMatrix rand(N,N) + b = @MMatrix rand(N,N) + c = @MMatrix rand(N,N) + test_muladd(a, b, c, α, β) + + N = 15 + a = @MMatrix rand(N,N) + b = @MMatrix rand(N,N) + c = @MMatrix rand(N,N) + test_muladd(a, b, c, α, β) + + N = 3 + a = SizedMatrix{N,N}(rand(N,N)) + b = SizedMatrix{N,N}(rand(N,N)) + c = SizedMatrix{N,N}(rand(N,N)) + test_muladd(a, b, c, α, β) + + N = 5 + a = SizedMatrix{N,N}(rand(N,N)) + b = SizedMatrix{N,N}(rand(N,N)) + c = SizedMatrix{N,N}(rand(N,N)) + test_muladd(a, b, c, α, β) + + N = 15 + a = SizedMatrix{N,N}(rand(N,N)) + b = SizedMatrix{N,N}(rand(N,N)) + c = SizedMatrix{N,N}(rand(N,N)) + test_muladd(a, b, c, α, β) + + # Test matrix-vector multiplication + N = 15 + a = @MMatrix rand(N,N) + b = @MVector rand(N) + c = @MVector rand(N) + test_muladd(a,b,c,α,β) + + N = 15 + a = rand(SizedMatrix{N,N}) + b = rand(SizedVector{N}) + c = rand(SizedVector{N}) + test_muladd(a,b,c,α,β) + + # Test outer products + N = 5 + a = @MVector rand(N) + b = @MVector rand(N) + c = @MMatrix rand(N,N) + test_muladd(a, b', c, α, β) + + N = 15 + a = rand(SizedVector{N}) + b = rand(SizedVector{N}) + c = rand(SizedMatrix{N,N}) +end + +#= +Summary: +All MMatrix sizes run without allocations +All MMatrix sizes have low compile times + +Slow compilation: + MMatrix initialization + MMatrix multiplication (not in place) +=# diff --git a/test/runtests.jl b/test/runtests.jl index 34b508fc..3a98c8e1 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -38,6 +38,7 @@ Random.seed!(42); include("arraymath.jl") include("broadcast.jl") include("linalg.jl") Random.seed!(42); include("matrix_multiply.jl") +Random.seed!(42); include("matrix_multiply_add.jl") Random.seed!(42); include("triangular.jl") include("det.jl") include("inv.jl") From f2ba7b027113a6e33cc80b6979bdbf0a09dc1db7 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Wed, 19 Feb 2020 15:17:51 -0800 Subject: [PATCH 2/8] Add in-place A'B --- src/matrix_multiply.jl | 24 +++- src/matrix_multiply_add.jl | 227 +++++++++++++++++++++++++----------- test/matrix_multiply_add.jl | 79 ++++++++----- 3 files changed, 232 insertions(+), 98 deletions(-) diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index c55978c5..98cbfb1b 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -4,7 +4,7 @@ import LinearAlgebra: BlasFloat, matprod, mul! # Manage dispatch of * and mul! # TODO Adjoint? (Inner product?) -@inline *(A::StaticMatrix, B::AbstractVector) = _mul(Size(A), A, B) +@inline *(A::StaticMatrix, B::AbwtractVector) = _mul(Size(A), A, B) @inline *(A::StaticMatrix, B::StaticVector) = _mul(Size(A), Size(B), A, B) @inline *(A::StaticMatrix, B::StaticMatrix) = _mul(Size(A), Size(B), A, B) @inline *(A::StaticVector, B::StaticMatrix) = *(reshape(A, Size(Size(A)[1], 1)), B) @@ -19,6 +19,10 @@ import LinearAlgebra: BlasFloat, matprod, mul! @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) #@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb}) +@inline mul!(dest::StaticMatrix, A::Transpose{<:Any,<:StaticMatrix}, B::StaticMatrix) = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, true, false) +@inline mul!(dest::StaticVector, A::Transpose{<:Any,<:StaticMatrix}, B::StaticVector) = + _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, true, false) @@ -210,6 +214,7 @@ end end end + # TODO aliasing problems if c === b? @generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticVector) where {sa, sb, sc} if sb[1] != sa[2] || sc[1] != sa[1] @@ -284,6 +289,23 @@ end end end +@inline mul_blas!(Sc::Size{s}, c::SizedArray{<:Any,T}, + Sa::Size{sa}, Sb::Size{sb}, + a::SizedArray{<:Any,T}, b::SizedArray{<:Any,T}, + α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = + BLAS.gemm!('N','N', α, a.data, b.data, β , c.data) + +@inline mul_blas!(Sc::Size{s}, c::SizedArray{<:Any,T}, + Sa::Size{sa}, Sb::Size{sb}, + a::Transpose{<:Any, <:SizedArray}, b::SizedArray{<:Any,T}, + α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = + BLAS.gemm!('T','N', α, a.parent.data, b.data, β , c.data) + +@inline mul_blas!(Sc::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, + Sa::Size{sa}, Sb::Size{sb}, + a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix{<:Any, <:Any, T}, + α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = + BLAS.gemm!('T','N', α, a.parent, b, β , c) @generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, ::Size{sa}, ::Size{sb}, diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl index 1cc8a217..662013e0 100644 --- a/src/matrix_multiply_add.jl +++ b/src/matrix_multiply_add.jl @@ -7,44 +7,52 @@ @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}, α::Real, β::Real) = _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) +@inline mul!(dest::StaticVector, A::Transpose{<:Any, <:StaticMatrix}, B::StaticVector, α::Real, β::Real) = + _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, α, β) +@inline mul!(dest::StaticMatrix, A::Transpose{<:Any, <:StaticMatrix}, B::StaticMatrix, α::Real, β::Real) = + _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) + +@inline multiplied_dimension(::Type{A}, ::Type{B}) where { + A<:Union{StaticMatrix,Transpose{<:Any,<:StaticMatrix}}, B<:StaticMatrix} = + prod(size(A)) * size(B,2) + +@inline multiplied_dimension(::Type{A}, ::Type{B}) where { + A<:Union{StaticMatrix,Transpose{<:Any,<:StaticMatrix}}, + B<:Transpose{<:Any, <:StaticMatrix}} = + prod(size(A)) * size(B,2) + # Matrix-matrix multiplication -@generated function _mul!(Sc::Size{sc}, c::StaticMatrix{<:Any, <:Any, Tc}, +@generated function _mul!(Sc::Size{sc}, c::AbstractMatrix, Sa::Size{sa}, Sb::Size{sb}, - a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticMatrix{<:Any, <:Any, Tb}, - α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} + a::AbstractMatrix, b::AbstractMatrix, + α::Real, β::Real) where {sa, sb, sc} + Ta,Tb,Tc = eltype(a), eltype(b), eltype(c) can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat - if can_blas - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) - return quote - @_inline_meta - muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - else - return quote - @_inline_meta - mul_blas!(Sc, c, Sa, Sb, a, b, α, β) - return c - end + mult_dim = multiplied_dimension(a,b) + if sa[1] * sa[2] * sb[2] < 4*4*4 + return quote + @_inline_meta + muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) + return c + end + elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) + return c end else - if sa[1] * sa[2] * sb[2] < 4*4*4 + if can_blas return quote @_inline_meta - mul_unrolled!(Sc, c, Sa, Sb, a, b) + mul_blas!(Sc, c, Sa, Sb, a, b, α, β) return c end else return quote @_inline_meta - mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) return c end end @@ -52,14 +60,15 @@ end # Matrix-vector multiplication -@generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, - a::StaticMatrix, b::StaticVector, α::Real, β::Real) where {sa, sb, sc} +@generated function _mul!(::Size{sc}, c::StaticVecOrMat, ::Size{sa}, ::Size{sb}, + a::StaticMatrix, b::StaticVector, α::Real, β::Real, ::Val{col}=Val(1)) where {sa, sb, sc,col} if sb[1] != sa[2] || sc[1] != sa[1] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end if sa[2] != 0 - exprs = [:(c[$k] = β * c[$k] + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + exprs = [:(c[$(LinearIndices(sc)[k,col])] = β * c[$(LinearIndices(sc)[k,col])] + + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] else exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] @@ -72,6 +81,27 @@ end end end +@generated function _tmul!(::Size{sc}, c::StaticVecOrMat, ::Size{sa}, ::Size{sb}, + a::StaticMatrix, b::StaticVector, α::Real, β::Real, ::Val{col}=Val(1)) where {sa, sb, sc,col} + if sb[1] != sa[1] || sc[1] != sa[2] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + exprs = [:(c[$(LinearIndices(sc)[k,col])] = β * c[$(LinearIndices(sc)[k,col])] + + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:(a[$(LinearIndices(sa)[j, k])]*b[$j]) for j = 1:sa[1]]))) for k = 1:sa[2]] + else + exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] + end + + return quote + @_inline_meta + @inbounds $(Expr(:block, exprs...)) + return c + end +end + # Outer product @generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector, b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}, @@ -90,7 +120,16 @@ end end end -@generated function muladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, + +@inline muladd_unrolled!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, + a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = + _muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) + +@inline muladd_unrolled!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, + a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = + _tmuladd_unrolled!(Sc, c, Size(a.parent), Sb, a.parent, b, α, β) + +@generated function _muladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) @@ -111,8 +150,39 @@ end end end -@generated function muladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, - ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix, +@generated function _tmuladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, + a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} + if sb[1] != sa[1] || sa[2] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $(reverse(sa)) and $sb and assign to array of size $sc")) + end + + if sa[2] != 0 + exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * + $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:(a[$(LinearIndices(sa)[j, k1])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[1]] + ))) for k1 = 1:sa[2], k2 = 1:sb[2]] + else + exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] + end + + return quote + @_inline_meta + @inbounds $(Expr(:block, exprs...)) + end +end + + +@inline muladd_unrolled_chunks!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, + a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = + _muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) + +@inline muladd_unrolled_chunks!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, + a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = + _tmuladd_unrolled_chunks!(Sc, c, Size(a.parent), Sb, a.parent, b, α, β) + +@generated function _muladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, + ::Size{sa}, ::Size{sb}, + a::Union{StaticMatrix, Transpose{<:Any, <:StaticMatrix}}, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) @@ -122,51 +192,72 @@ end # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster tmp_type = SVector{sb[1], eltype(c)} - vect_exprs = [:($(Symbol("tmp_$k2")) = - partly_unrolled_multiply($(Size(sa)), $(Size(sb[1])), a, - $(Expr(:call, tmp_type, - [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * - $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + col_mult = [:($(Symbol("tmp_$k2")) = + _mul!($(Size(sc)), c, $(Size(sa)), $(Size(sb[1])), a, + $(Expr(:call, tmp_type, + [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)),α,β,Val($k2))) for k2 = 1:sb[2]] return quote @_inline_meta - @inbounds $(Expr(:block, vect_exprs...)) - @inbounds $(Expr(:block, exprs...)) + return $(Expr(:block, col_mult...)) end end -# Special-case SizedMatrix -@inline mul!(dest::SizedMatrix{<:Any, <:Any, Tc}, A::SizedMatrix{<:Any, <:Any, Ta}, - B::SizedMatrix{<:Any, <:Any, Tb}) where {Ta,Tb,Tc} = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, one(Ta), zero(Ta)) +@generated function _tmuladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, + ::Size{sa}, ::Size{sb}, + a::Union{StaticMatrix, Transpose{<:Any, <:StaticMatrix}}, b::StaticMatrix, + α::Real, β::Real) where {sa, sb, sc} + if sb[1] != sa[1] || sa[2] != sc[1] || sb[2] != sc[2] + throw(DimensionMismatch("Tried to multiply arrays of size $(reverse(sa)) and $sb and assign to array of size $sc")) + end -@generated function _mul!(Sc::Size{sc}, c::SizedMatrix{<:Any, <:Any, Tc}, - Sa::Size{sa}, Sb::Size{sb}, - a::SizedMatrix{<:Any, <:Any, Ta}, b::SizedMatrix{<:Any, <:Any, Tb}, - α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} - can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat + #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] - if can_blas - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) - return quote - @_inline_meta - muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - else - return quote - # @_inline_meta - BLAS.gemm!('N','N', α, a.data, b.data, β, c.data) - return c - end - end + # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster + tmp_type = SVector{sb[1], eltype(c)} + + col_mult = [:($(Symbol("tmp_$k2")) = + _tmul!($(Size(sc)), c, $(Size(sa)), $(Size(sb[1])), a, + $(Expr(:call, tmp_type, + [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)),α,β,Val($k2))) for k2 = 1:sb[2]] + + return quote + @_inline_meta + return $(Expr(:block, col_mult...)) end end + +# Special-case SizedMatrix +# @inline mul!(dest::SizedMatrix{<:Any, <:Any, Tc}, A::SizedMatrix{<:Any, <:Any, Ta}, +# B::SizedMatrix{<:Any, <:Any, Tb}) where {Ta,Tb,Tc} = +# _mul!(Size(dest), dest, Size(A), Size(B), A, B, one(Ta), zero(Ta)) +# +# @generated function _mul!(Sc::Size{sc}, c::SizedMatrix{<:Any, <:Any, Tc}, +# Sa::Size{sa}, Sb::Size{sb}, +# a::SizedMatrix{<:Any, <:Any, Ta}, b::SizedMatrix{<:Any, <:Any, Tb}, +# α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} +# can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat +# +# if can_blas +# if sa[1] * sa[2] * sb[2] < 4*4*4 +# return quote +# @_inline_meta +# muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) +# return c +# end +# elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) +# return quote +# @_inline_meta +# muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) +# return c +# end +# else +# return quote +# # @_inline_meta +# BLAS.gemm!('N','N', α, a.data, b.data, β, c.data) +# return c +# end +# end +# end +# end diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index c4c11d7d..f9bdb8b5 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -1,56 +1,76 @@ -function test_muladd(a,b,c, α, β) +function test_muladd(a,b,c, α, β, test_transpose=true) c0 = copy(c) - mul!(c,a,b,α,β) - @test c ≈ a*b*α + c0*β - c .= c0 + b0 = copy(b) + mul!(c,a,b) @test c ≈ a*b + + c .= c0 + b .= b0 + mul!(c,a,b,α,β) + @test c ≈ a*b*α + c0*β + @test (@allocated mul!(c,a,b)) == 0 @test (@allocated mul!(c,a,b,α,β)) == 0 + + if test_transpose + c .= c0 + b .= b0 + mul!(b, Transpose(a), c) + @test b ≈ a'c + + c .= c0 + b .= b0 + mul!(b,Transpose(a),c,α,β) + @test b ≈ a'c*α + b0*β + + @test (@allocated mul!(b,Transpose(a),c)) == 0 + @test (@allocated mul!(b,Transpose(a),c,α,β)) == 0 + end end @testset "Matrix multiply-add" begin α,β = 2.0, 1.0 # Test matrix multiplication - N = 3 - a = @MMatrix rand(N,N) - b = @MMatrix rand(N,N) - c = @MMatrix rand(N,N) + N1,N2 = 3,2 + a = @MMatrix rand(N1,N2) + b = @MMatrix rand(N2,N2) + c = @MMatrix rand(N1,N2) test_muladd(a, b, c, α, β) test_muladd(a, b, c, 1, β) test_muladd(a, b, c, α, 2) test_muladd(a, b, c, α, Float32(3)) - N = 5 - a = @MMatrix rand(N,N) - b = @MMatrix rand(N,N) - c = @MMatrix rand(N,N) + N1,N2 = 5,6 + a = @MMatrix rand(N1,N2) + b = @MMatrix rand(N2,N2) + c = @MMatrix rand(N1,N2) test_muladd(a, b, c, α, β) - N = 15 - a = @MMatrix rand(N,N) - b = @MMatrix rand(N,N) - c = @MMatrix rand(N,N) + N1,N2 = 14,16 + a = @MMatrix rand(N1,N2) + b = @MMatrix rand(N2,N2) + c = @MMatrix rand(N1,N2) test_muladd(a, b, c, α, β) - N = 3 - a = SizedMatrix{N,N}(rand(N,N)) - b = SizedMatrix{N,N}(rand(N,N)) - c = SizedMatrix{N,N}(rand(N,N)) + N1,N2 = 3,2 + a = SizedMatrix{N1,N2}(rand(N1,N2)) + b = SizedMatrix{N2,N2}(rand(N2,N2)) + c = SizedMatrix{N1,N2}(rand(N1,N2)) test_muladd(a, b, c, α, β) - N = 5 - a = SizedMatrix{N,N}(rand(N,N)) - b = SizedMatrix{N,N}(rand(N,N)) - c = SizedMatrix{N,N}(rand(N,N)) + N1,N2 = 5,6 + a = SizedMatrix{N1,N2}(rand(N1,N2)) + b = SizedMatrix{N2,N2}(rand(N2,N2)) + c = SizedMatrix{N1,N2}(rand(N1,N2)) test_muladd(a, b, c, α, β) - N = 15 - a = SizedMatrix{N,N}(rand(N,N)) - b = SizedMatrix{N,N}(rand(N,N)) - c = SizedMatrix{N,N}(rand(N,N)) + N1,N2 = 14,16 + a = SizedMatrix{N1,N2}(rand(N1,N2)) + b = SizedMatrix{N2,N2}(rand(N2,N2)) + c = SizedMatrix{N1,N2}(rand(N1,N2)) test_muladd(a, b, c, α, β) # Test matrix-vector multiplication @@ -71,12 +91,13 @@ end a = @MVector rand(N) b = @MVector rand(N) c = @MMatrix rand(N,N) - test_muladd(a, b', c, α, β) + test_muladd(a,b',c,α,β, false) N = 15 a = rand(SizedVector{N}) b = rand(SizedVector{N}) c = rand(SizedMatrix{N,N}) + test_muladd(a,b',c,α,β, false) end #= From 35389df1ae5b0d66924b455410fa59199ce4deed Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Wed, 19 Feb 2020 15:25:52 -0800 Subject: [PATCH 3/8] Fix typo --- src/matrix_multiply.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index 98cbfb1b..b694cbe5 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -4,7 +4,7 @@ import LinearAlgebra: BlasFloat, matprod, mul! # Manage dispatch of * and mul! # TODO Adjoint? (Inner product?) -@inline *(A::StaticMatrix, B::AbwtractVector) = _mul(Size(A), A, B) +@inline *(A::StaticMatrix, B::AbstractVector) = _mul(Size(A), A, B) @inline *(A::StaticMatrix, B::StaticVector) = _mul(Size(A), Size(B), A, B) @inline *(A::StaticMatrix, B::StaticMatrix) = _mul(Size(A), Size(B), A, B) @inline *(A::StaticVector, B::StaticMatrix) = *(reshape(A, Size(Size(A)[1], 1)), B) From be0f54dbee5230acdb8b2bea3d12737002a79cb8 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Fri, 6 Mar 2020 09:42:30 -0800 Subject: [PATCH 4/8] Unify in-place multiplication methods --- Project.toml | 3 +- src/matrix_multiply.jl | 224 ++------------------ src/matrix_multiply_add.jl | 400 +++++++++++++++++++----------------- test/matrix_multiply_add.jl | 257 +++++++++++++---------- 4 files changed, 382 insertions(+), 502 deletions(-) diff --git a/Project.toml b/Project.toml index fc56beb8..2e977db4 100644 --- a/Project.toml +++ b/Project.toml @@ -13,6 +13,7 @@ julia = "1" [extras] InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" [targets] -test = ["InteractiveUtils", "Test"] +test = ["InteractiveUtils", "Test", "BenchmarkTools"] diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index b694cbe5..ad82c56b 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -13,16 +13,15 @@ import LinearAlgebra: BlasFloat, matprod, mul! @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Adjoint{<:Any,<:StaticVector}) where {N} = vec(A) * B @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Transpose{<:Any,<:StaticVector}) where {N} = vec(A) * B -@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticVector) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticMatrix) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::StaticMatrix) = mul!(dest, reshape(A, Size(Size(A)[1], 1)), B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -#@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb}) -@inline mul!(dest::StaticMatrix, A::Transpose{<:Any,<:StaticMatrix}, B::StaticMatrix) = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, true, false) -@inline mul!(dest::StaticVector, A::Transpose{<:Any,<:StaticMatrix}, B::StaticVector) = - _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, true, false) +# @inline mul!(dest::StaticVecOrMat, A::StaticVecOrMat, B::StaticVecOrMat) = mul +# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::StaticMatrix) = mul!(dest, reshape(A, Size(Size(A)[1], 1)), B) +# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) +# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) +# #@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb}) +# @inline mul!(dest::StaticMatrix, A::Transpose{<:Any,<:StaticMatrix}, B::StaticMatrix) = +# _mul!(Size(dest), dest, Size(A), Size(B), A, B, true, false) +# @inline mul!(dest::StaticVector, A::Transpose{<:Any,<:StaticMatrix}, B::StaticVector) = +# _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, true, false) @@ -101,10 +100,14 @@ end # Heuristic choice between BLAS and explicit unrolling (or chunk-based unrolling) if sa[1]*sa[2]*sb[2] >= 14*14*14 + Sa = TSize{size(S),false}() + Sb = TSize{sa,false}() + Sc = TSize{sb,false}() + _add = MulAddMul(true,false) return quote @_inline_meta C = similar(a, T, $S) - mul_blas!($S, C, Sa, Sb, a, b) + mul_blas!($Sa, C, $Sa, $Sb, a, b, $_add) return C end elseif sa[1]*sa[2]*sb[2] < 8*8*8 @@ -214,203 +217,4 @@ end end end - -# TODO aliasing problems if c === b? -@generated function _mul!(::Size{sc}, c::StaticVector, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticVector) where {sa, sb, sc} - if sb[1] != sa[2] || sc[1] != sa[1] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - if sa[2] != 0 - exprs = [:(c[$k] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] - else - exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - return c - end -end - -@generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector, - b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}) where {sa, sb, sc} - if sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - exprs = [:(c[$(LinearIndices(sc)[i, j])] = a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - return c - end -end - -@generated function _mul!(Sc::Size{sc}, c::StaticMatrix{<:Any, <:Any, Tc}, Sa::Size{sa}, Sb::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticMatrix{<:Any, <:Any, Tb}) where {sa, sb, sc, Ta, Tb, Tc} - can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat - - if can_blas - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - mul_unrolled!(Sc, c, Sa, Sb, a, b) - return c - end - elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) - return quote - @_inline_meta - mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) - return c - end - else - return quote - @_inline_meta - mul_blas!(Sc, c, Sa, Sb, a, b) - return c - end - end - else - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - mul_unrolled!(Sc, c, Sa, Sb, a, b) - return c - end - else - return quote - @_inline_meta - mul_unrolled_chunks!(Sc, c, Sa, Sb, a, b) - return c - end - end - end -end - -@inline mul_blas!(Sc::Size{s}, c::SizedArray{<:Any,T}, - Sa::Size{sa}, Sb::Size{sb}, - a::SizedArray{<:Any,T}, b::SizedArray{<:Any,T}, - α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = - BLAS.gemm!('N','N', α, a.data, b.data, β , c.data) - -@inline mul_blas!(Sc::Size{s}, c::SizedArray{<:Any,T}, - Sa::Size{sa}, Sb::Size{sb}, - a::Transpose{<:Any, <:SizedArray}, b::SizedArray{<:Any,T}, - α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = - BLAS.gemm!('T','N', α, a.parent.data, b.data, β , c.data) - -@inline mul_blas!(Sc::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, - Sa::Size{sa}, Sb::Size{sb}, - a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix{<:Any, <:Any, T}, - α::Real=one(T), β::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} = - BLAS.gemm!('T','N', α, a.parent, b, β , c) - -@generated function mul_blas!(::Size{s}, c::StaticMatrix{<:Any, <:Any, T}, - ::Size{sa}, ::Size{sb}, - a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}, - alpha::Real=one(T), beta::Real=zero(T)) where {s,sa,sb, T <: BlasFloat} - if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s")) - end - - if sa[1] > 0 && sa[2] > 0 && sb[2] > 0 - # This code adapted from `gemm!()` in base/linalg/blas.jl - - if T == Float64 - gemm = :dgemm_ - elseif T == Float32 - gemm = :sgemm_ - elseif T == Complex{Float64} - gemm = :zgemm_ - else # T == Complex{Float32} - gemm = :cgemm_ - end - - blascall = quote - ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Nothing, - (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt}, - Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, - Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, - Ref{LinearAlgebra.BLAS.BlasInt}), - transA, transB, m, n, - ka, alpha, a, strideA, - b, strideB, beta, c, - strideC) - end - - return quote - transA = 'N' - transB = 'N' - m = $(sa[1]) - ka = $(sa[2]) - kb = $(sb[1]) - n = $(sb[2]) - strideA = $(sa[1]) - strideB = $(sb[1]) - strideC = $(s[1]) - - $blascall - - return c - end - else - throw(DimensionMismatch("Cannot call BLAS gemm with zero-dimension arrays, attempted $sa * $sb -> $s.")) - end -end - - -@generated function mul_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]]))) for k1 = 1:sa[1], k2 = 1:sb[2]] - else - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - end -end - -@generated function mul_unrolled_chunks!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticMatrix, b::StaticMatrix) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] - - # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster - tmp_type = SVector{sb[1], eltype(c)} - vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(Size(sa)), $(Size(sb[1])), a, $(Expr(:call, tmp_type, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] - - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] - - return quote - @_inline_meta - @inbounds $(Expr(:block, vect_exprs...)) - @inbounds $(Expr(:block, exprs...)) - end -end - -#function mul_blas(a, b, c, A, B) -#q -#end - -# The idea here is to get pointers to stack variables and call BLAS. -# This saves an aweful lot of time compared to copying SArray's to Ref{SArray{...}} -# and using BLAS should be fastest for (very) large SArrays - -# Here is an LLVM function that gets the pointer to its input, %x -# After this we would make the ccall above. # -# define i8* @f(i32 %x) #0 { -# %1 = alloca i32, align 4 -# store i32 %x, i32* %1, align 4 -# ret i32* %1 -# } diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl index 662013e0..18146cd1 100644 --- a/src/matrix_multiply_add.jl +++ b/src/matrix_multiply_add.jl @@ -1,263 +1,295 @@ +import LinearAlgebra.MulAddMul + +""" Size that stores whether a Matrix is a Transpose +Useful when selecting multiplication methods, and avoiding allocations when dealing with +the `Transpose` type by passing around the original matrix. +Should pair with `parent`. +""" +struct TSize{S,T} + function TSize{S,T}() where {S,T} + new{S::Tuple{Vararg{StaticDimension}},T::Bool}() + end +end +TSize(A::Type{<:Transpose{<:Any,<:StaticArray}}) = TSize{size(A),true}() +TSize(A::Type{<:Adjoint{<:Real,<:StaticArray}}) = TSize{size(A),true}() # can't handle complex adjoints yet +TSize(A::Type{<:StaticArray}) = TSize{size(A),false}() +TSize(A::StaticArrayLike) = TSize(typeof(A)) +istranpose(::TSize{<:Any,T}) where T = T +size(::TSize{S}) where S = S +Size(::TSize{S}) where S = Size{S}() +Base.transpose(::TSize{S,T}) where {S,T} = TSize{reverse(S),!T}() + +# Get the parent of transposed arrays, or the array itself if it has no parent +# QUESTION: maybe call this something else? +Base.parent(A::Union{<:Transpose{<:Any,<:StaticArray}, <:Adjoint{<:Any,<:StaticArray}}) = A.parent +Base.parent(A::StaticArray) = A # 5-argument matrix multiplication -@inline LinearAlgebra.mul!(dest::StaticVecOrMat, A::StaticMatrix, B::StaticVecOrMat, α::Real, β::Real) = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}, α::Real, β::Real) = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) -@inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}, α::Real, β::Real) = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) +# To avoid allocations, strip away Transpose type and store tranpose info in Size +@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike, B::StaticVecOrMatLike, + α::Real, β::Real) = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), MulAddMul(α,β)) -@inline mul!(dest::StaticVector, A::Transpose{<:Any, <:StaticMatrix}, B::StaticVector, α::Real, β::Real) = - _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, α, β) -@inline mul!(dest::StaticMatrix, A::Transpose{<:Any, <:StaticMatrix}, B::StaticMatrix, α::Real, β::Real) = - _mul!(Size(dest), dest, Size(A), Size(B), A, B, α, β) - -@inline multiplied_dimension(::Type{A}, ::Type{B}) where { - A<:Union{StaticMatrix,Transpose{<:Any,<:StaticMatrix}}, B<:StaticMatrix} = +"Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling." +@inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) = prod(size(A)) * size(B,2) -@inline multiplied_dimension(::Type{A}, ::Type{B}) where { - A<:Union{StaticMatrix,Transpose{<:Any,<:StaticMatrix}}, - B<:Transpose{<:Any, <:StaticMatrix}} = - prod(size(A)) * size(B,2) - -# Matrix-matrix multiplication -@generated function _mul!(Sc::Size{sc}, c::AbstractMatrix, - Sa::Size{sa}, Sb::Size{sb}, - a::AbstractMatrix, b::AbstractMatrix, - α::Real, β::Real) where {sa, sb, sc} - Ta,Tb,Tc = eltype(a), eltype(b), eltype(c) - can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat - - mult_dim = multiplied_dimension(a,b) - if sa[1] * sa[2] * sb[2] < 4*4*4 - return quote - @_inline_meta - muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) - return quote - @_inline_meta - muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - else - if can_blas - return quote - @_inline_meta - mul_blas!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - else - return quote - @_inline_meta - muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) - return c - end - end +""" Combine left and right sides of an assignment expression, short-cutting + lhs = α * rhs + β * lhs, + element-wise. +If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed. +""" +function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul{Tα,Tβ}}) where {Tα,Tβ} + @assert length(lhs) == length(rhs) + n = length(rhs) + if !Tα # not 1 + rhs = [:(α * $(expr)) for expr in rhs] + end + if !Tβ # not 0 + rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n] end + exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n] + return exprs end -# Matrix-vector multiplication -@generated function _mul!(::Size{sc}, c::StaticVecOrMat, ::Size{sa}, ::Size{sb}, - a::StaticMatrix, b::StaticVector, α::Real, β::Real, ::Val{col}=Val(1)) where {sa, sb, sc,col} +"Validate the dimensions of a matrix multiplication, including matrix-vector products" +function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc} if sb[1] != sa[2] || sc[1] != sa[1] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) + return false + elseif length(sc) == 2 || length(sb) == 2 + sc2 = length(sc) == 1 ? 1 : sc[2] + sb2 = length(sb) == 1 ? 1 : sb[2] + if sc2 != sb2 + return false + end end + return true +end - if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k,col])] = β * c[$(LinearIndices(sc)[k,col])] - + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), - [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] +"Obtain an expression for the linear index of var[k,j], taking transposes into account" +@inline _lind(A::Type{<:TSize}, k::Int, j::Int) = _lind(:a, A, k, j) +function _lind(var::Symbol, A::Type{TSize{sa,tA}}, k::Int, j::Int) where {sa,tA} + if tA + return :($var[$(LinearIndices(reverse(sa))[j, k])]) else - exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) - return c + return :($var[$(LinearIndices(sa)[k, j])]) end end -@generated function _tmul!(::Size{sc}, c::StaticVecOrMat, ::Size{sa}, ::Size{sb}, - a::StaticMatrix, b::StaticVector, α::Real, β::Real, ::Val{col}=Val(1)) where {sa, sb, sc,col} - if sb[1] != sa[1] || sc[1] != sa[2] +# Matrix-vector multiplication +@generated function _mul!(Sc::TSize{sc}, c::StaticVecOrMatLike, Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrix, b::StaticVector, _add::MulAddMul, + ::Val{col}=Val(1)) where {sa, sb, sc, col} + if sa[2] != sb[1] || sc[1] != sa[1] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k,col])] = β * c[$(LinearIndices(sc)[k,col])] - + α * $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), - [:(a[$(LinearIndices(sa)[j, k])]*b[$j]) for j = 1:sa[1]]))) for k = 1:sa[2]] + lhs = [:($(_lind(:c,Sc,k,col))) for k = 1:sa[1]] + ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:($(_lind(Sa,k,j))*b[$j]) for j = 1:sa[2]]))) for k = 1:sa[1]] + exprs = _muladd_expr(lhs, ab, _add) else exprs = [:(c[$k] = zero(eltype(c))) for k = 1:sa[1]] end return quote - @_inline_meta + # @_inline_meta + α = _add.alpha + β = _add.beta @inbounds $(Expr(:block, exprs...)) return c end end # Outer product -@generated function _mul!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, a::StaticVector, - b::Union{Transpose{<:Any, <:StaticVector}, Adjoint{<:Any, <:StaticVector}}, - α::Real, β::Real) where {sa, sb, sc} - if sa[1] != sc[1] || sb[2] != sc[2] +@generated function _mul!(::TSize{sc}, c::StaticMatrix, ::TSize{sa,false}, ::TSize{sb,true}, a::StaticVector, + b::StaticVector, + _add::MulAddMul) where {sa, sb, sc} + if sc[1] != sa[1] || sc[2] != sb[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end - exprs = [:(c[$(LinearIndices(sc)[i, j])] = β * c[$(LinearIndices(sc)[i, j])] + α * - a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] + lhs = [:(c[$(LinearIndices(sc)[i,j])]) for i = 1:sa[1], j = 1:sb[2]] + ab = [:(a[$i] * b[$j]) for i = 1:sa[1], j = 1:sb[2]] + exprs = _muladd_expr(lhs, ab, _add) return quote @_inline_meta + α = _add.alpha + β = _add.beta @inbounds $(Expr(:block, exprs...)) return c end end +# Matrix-matrix multiplication +@generated function _mul!(Sc::TSize{sc}, c::StaticMatrixLike, + Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrixLike, b::StaticMatrixLike, + _add::MulAddMul) where {sa, sb, sc} + Ta,Tb,Tc = eltype(a), eltype(b), eltype(c) + can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat -@inline muladd_unrolled!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, - a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = - _muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) - -@inline muladd_unrolled!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, - a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = - _tmuladd_unrolled!(Sc, c, Size(a.parent), Sb, a.parent, b, α, β) - -@generated function _muladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, - a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) - end - - if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * - $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), - [:(a[$(LinearIndices(sa)[k1, j])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[2]] - ))) for k1 = 1:sa[1], k2 = 1:sb[2]] + mult_dim = multiplied_dimension(a,b) + if mult_dim < 4*4*4 + return quote + @_inline_meta + muladd_unrolled_all!(Sc, c, Sa, Sb, a, b, _add) + # return c + end + elseif mult_dim < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add) + # return c + end else - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] - end - - return quote - @_inline_meta - @inbounds $(Expr(:block, exprs...)) + if can_blas + return quote + @_inline_meta + mul_blas!(Sc, c, Sa, Sb, a, b, _add) + return c + end + else + return quote + @_inline_meta + muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add) + return c + end + end end end -@generated function _tmuladd_unrolled!(::Size{sc}, c::StaticMatrix, ::Size{sa}, ::Size{sb}, - a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} - if sb[1] != sa[1] || sa[2] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $(reverse(sa)) and $sb and assign to array of size $sc")) + +@generated function muladd_unrolled_all!(Sc::TSize{sc}, c::StaticMatrixLike, Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrixLike, b::StaticMatrixLike, _add::MulAddMul) where {sa, sb, sc} + if !check_dims(Size(sc),Size(sa),Size(sb)) + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end if sa[2] != 0 - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = β*c[$(LinearIndices(sc)[k1, k2])] + α * - $(reduce((ex1,ex2) -> :(+($ex1,$ex2)), - [:(a[$(LinearIndices(sa)[j, k1])]*b[$(LinearIndices(sb)[j, k2])]) for j = 1:sa[1]] - ))) for k1 = 1:sa[2], k2 = 1:sb[2]] - else - exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = zero(eltype(c))) for k1 = 1:sa[1], k2 = 1:sb[2]] + lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]] + ab = [:($(reduce((ex1,ex2) -> :(+($ex1,$ex2)), + [:($(_lind(:a, Sa, k1, j)) * $(_lind(:b, Sb, j, k2))) for j = 1:sa[2]] + ))) for k1 = 1:sa[1], k2 = 1:sb[2]] + exprs = _muladd_expr(lhs, ab, _add) end return quote @_inline_meta + α = _add.alpha + β = _add.beta @inbounds $(Expr(:block, exprs...)) end end -@inline muladd_unrolled_chunks!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, - a::StaticMatrix, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = - _muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) - -@inline muladd_unrolled_chunks!(Sc::Size{sc}, c::StaticMatrix, Sa::Size{sa}, Sb::Size{sb}, - a::Transpose{<:Any, <:StaticMatrix}, b::StaticMatrix, α::Real, β::Real) where {sa, sb, sc} = - _tmuladd_unrolled_chunks!(Sc, c, Size(a.parent), Sb, a.parent, b, α, β) - -@generated function _muladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, - ::Size{sa}, ::Size{sb}, - a::Union{StaticMatrix, Transpose{<:Any, <:StaticMatrix}}, b::StaticMatrix, - α::Real, β::Real) where {sa, sb, sc} - if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] +@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, c::StaticMatrix, + Sa::TSize{sa}, Sb::TSize{sb}, + a::StaticMatrixLike, b::StaticMatrixLike, + _add::MulAddMul) where {sa, sb, sc} + if !check_dims(Size(sc),Size(sa),Size(sb)) throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end - #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] - # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster tmp_type = SVector{sb[1], eltype(c)} - col_mult = [:($(Symbol("tmp_$k2")) = - _mul!($(Size(sc)), c, $(Size(sa)), $(Size(sb[1])), a, + col_mult = [:( + _mul!(Sc, c, Sa, Sb, a, $(Expr(:call, tmp_type, - [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)),α,β,Val($k2))) for k2 = 1:sb[2]] + [:($(_lind(:b, Sb, i, k2))) for i = 1:sb[1]]...)),_add,Val($k2))) for k2 = 1:sb[2]] return quote - @_inline_meta + α = _add.alpha + β = _add.beta return $(Expr(:block, col_mult...)) end end -@generated function _tmuladd_unrolled_chunks!(::Size{sc}, c::StaticMatrix, - ::Size{sa}, ::Size{sb}, - a::Union{StaticMatrix, Transpose{<:Any, <:StaticMatrix}}, b::StaticMatrix, - α::Real, β::Real) where {sa, sb, sc} - if sb[1] != sa[1] || sa[2] != sc[1] || sb[2] != sc[2] - throw(DimensionMismatch("Tried to multiply arrays of size $(reverse(sa)) and $sb and assign to array of size $sc")) - end - - #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] - - # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster - tmp_type = SVector{sb[1], eltype(c)} +@inline _get_raw_data(A::SizedArray) = A.data +@inline _get_raw_data(A::StaticArray) = A + +function mul_blas!(::TSize{<:Any,false}, c::StaticMatrix, ::TSize{<:Any,tA}, ::TSize{<:Any,tB}, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {tA,tB} + mat_char(tA) = tA ? 'T' : 'N' + T = eltype(a) + A = _get_raw_data(a) + B = _get_raw_data(b) + C = _get_raw_data(c) + BLAS.gemm!(mat_char(tA), mat_char(tB), T(_add.alpha), A, B, T(_add.beta), C) +end - col_mult = [:($(Symbol("tmp_$k2")) = - _tmul!($(Size(sc)), c, $(Size(sa)), $(Size(sb[1])), a, - $(Expr(:call, tmp_type, - [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)),α,β,Val($k2))) for k2 = 1:sb[2]] +# if C is transposed, transpose the entire expression +@inline mul_blas!(Sc::TSize{<:Any,true}, c::StaticMatrix, Sa::TSize, Sb::TSize, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) = + mul_blas!(transpose(Sc), c, transpose(Sb), transpose(Sa), b, a, _add) - return quote - @_inline_meta - return $(Expr(:block, col_mult...)) - end -end +# TODO: Get this version of mul_blas! working so there's backward-compatibility with older Julia versions -# Special-case SizedMatrix -# @inline mul!(dest::SizedMatrix{<:Any, <:Any, Tc}, A::SizedMatrix{<:Any, <:Any, Ta}, -# B::SizedMatrix{<:Any, <:Any, Tb}) where {Ta,Tb,Tc} = -# _mul!(Size(dest), dest, Size(A), Size(B), A, B, one(Ta), zero(Ta)) +# @generated function mul_blas!(::TSize{s,false}, c::StaticMatrix{<:Any, <:Any, T}, +# ::TSize{sa,tA}, ::TSize{sb,tB}, +# a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}, +# _add::MulAddMul = MulAddMul()) where {s,sa,sb, T <: BlasFloat, tA, tB} +# if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2] +# throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s")) +# end +# +# if sa[1] > 0 && sa[2] > 0 && sb[2] > 0 +# # This code adapted from `gemm!()` in base/linalg/blas.jl +# +# if T == Float64 +# gemm = :dgemm_ +# elseif T == Float32 +# gemm = :sgemm_ +# elseif T == Complex{Float64} +# gemm = :zgemm_ +# else # T == Complex{Float32} +# gemm = :cgemm_ +# end +# +# mat_char(tA) = tA ? 'T' : 'N' +# transA = mat_char(tA) +# transB = mat_char(tB) +# +# m = sa[tA ? 2 : 1] +# ka = sa[tA ? 1 : 2] +# kb = sb[tB ? 2 : 1] +# n = sb[tB ? 1 : 2] +# +# blascall = quote +# ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Nothing, +# (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt}, +# Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, +# Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, +# Ref{LinearAlgebra.BLAS.BlasInt}), +# $transA, $transB, $m, $n, +# $ka, alpha, A, strideA, +# B, strideB, beta, C, +# strideC) +# end +# +# return quote +# println($transA) +# println($transB) +# alpha = _add.alpha +# beta = _add.beta +# # m = $(sa[1]) +# # ka = $(sa[2]) +# # kb = $(sb[1]) +# # n = $(sb[2]) +# strideA = $m #$(sa[1]) +# strideB = $kb #$(sb[1]) +# strideC = $(s[1]) +# A = _get_raw_data(a) +# B = _get_raw_data(b) +# C = _get_raw_data(c) # -# @generated function _mul!(Sc::Size{sc}, c::SizedMatrix{<:Any, <:Any, Tc}, -# Sa::Size{sa}, Sb::Size{sb}, -# a::SizedMatrix{<:Any, <:Any, Ta}, b::SizedMatrix{<:Any, <:Any, Tb}, -# α::Real, β::Real) where {sa, sb, sc, Ta, Tb, Tc} -# can_blas = Tc == Ta && Tc == Tb && Tc <: BlasFloat +# $blascall # -# if can_blas -# if sa[1] * sa[2] * sb[2] < 4*4*4 -# return quote -# @_inline_meta -# muladd_unrolled!(Sc, c, Sa, Sb, a, b, α, β) -# return c -# end -# elseif sa[1] * sa[2] * sb[2] < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) -# return quote -# @_inline_meta -# muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, α, β) -# return c -# end -# else -# return quote -# # @_inline_meta -# BLAS.gemm!('N','N', α, a.data, b.data, β, c.data) -# return c -# end +# return c # end +# else +# throw(DimensionMismatch("Cannot call BLAS gemm with zero-dimension arrays, attempted $sa * $sb -> $s.")) # end # end diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index f9bdb8b5..5fa84fdf 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -1,111 +1,154 @@ - -function test_muladd(a,b,c, α, β, test_transpose=true) - c0 = copy(c) - b0 = copy(b) - - mul!(c,a,b) - @test c ≈ a*b - - c .= c0 - b .= b0 - mul!(c,a,b,α,β) - @test c ≈ a*b*α + c0*β - - @test (@allocated mul!(c,a,b)) == 0 - @test (@allocated mul!(c,a,b,α,β)) == 0 - - if test_transpose - c .= c0 - b .= b0 - mul!(b, Transpose(a), c) - @test b ≈ a'c - - c .= c0 - b .= b0 - mul!(b,Transpose(a),c,α,β) - @test b ≈ a'c*α + b0*β - - @test (@allocated mul!(b,Transpose(a),c)) == 0 - @test (@allocated mul!(b,Transpose(a),c,α,β)) == 0 +using Test +using StaticArrays +using LinearAlgebra +using BenchmarkTools + +# check_dims +@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,)) +@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(4,)) +@test !StaticArrays.check_dims(Size(4,), Size(3,4), Size(4,)) + +@test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,1)) +@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,)) +@test StaticArrays.check_dims(Size(4,1), Size(4,3), Size(3,1)) +@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,)) +@test !StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,2)) +@test StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,2)) +@test !StaticArrays.check_dims(Size(4,2), Size(4,3), Size(3,3)) + +function test_multiply_add(N1,N2,ArrayType=MArray) + if ArrayType <: MArray + Mat = MMatrix + Vec = MVector + elseif ArrayType <: SizedArray + Mat = SizedMatrix + Vec = SizedVector end + A = rand(Mat{N1,N2}) + At = Transpose(A) + b = rand(Vec{N2}) + c = rand(Vec{N1}) + + # Parent + @test parent(A) === A + @test parent(At) === A + @test size(parent(At)) == (N1,N2) + @test parent(b') === b + @test size(parent(b')) == (N2,) + + # TSize + ta = StaticArrays.TSize(A) + @test !StaticArrays.istranpose(ta) + @test size(ta) == (N1,N2) + @test Size(ta) == Size(N1,N2) + ta = StaticArrays.TSize(At) + @test StaticArrays.istranpose(ta) + @test size(ta) == (N2,N1) + @test Size(ta) == Size(N2,N1) + tb = StaticArrays.TSize(b') + @test StaticArrays.istranpose(tb) + ta = transpose(ta) + @test !StaticArrays.istranpose(ta) + @test size(ta) == (N1,N2) + @test Size(ta) == Size(N1,N2) + + # A*b + mul!(c,A,b) + @test c ≈ A*b + mul!(c,A,b,1.0,0.0) + @test c ≈ A*b + mul!(c,A,b,1.0,1.0) + @test c ≈ 2A*b + + # matrix-transpose × vector + mul!(b,At,c) + @test b ≈ A'c + mul!(b,At,c,2.0,0.0) + @test b ≈ 2A'c + mul!(b,At,c,1.0,2.0) + @test b ≈ 5A'c + + bmark = @benchmark mul!($c,$A,$b) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + b = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 + @test minimum(b).allocs == 0 + + # outer product + C = rand(Mat{N1,N2}) + a = rand(Vec{N1}) + b = rand(Vec{N2}) + mul!(C,a,b') + @test C ≈ a*b' + mul!(C,a,b',2.,0.) + @test C ≈ 2a*b' + mul!(C,a,b',1.,1.) + @test C ≈ 3a*b' + + b = @benchmark mul!($C,$a,$b') samples=10 evals=10 + @test minimum(b).allocs == 0 + + # A × B + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + mul!(C,A,B) + @test C ≈ A*B + mul!(C,A,B,2.0,0.0) + @test C ≈ 2A*B + mul!(C,A,B,2.0,1.0) + @test C ≈ 4A*B + + b = @benchmark mul!($C,$A,$B,1,0) samples=10 evals=10 + @test minimum(b).allocs == 0 + + # A'B + At = Transpose(A) + mul!(B,At,C) + @test B ≈ A'C + mul!(B,At,C,2.0,0.0) + @test B ≈ 2A'C + mul!(B,At,C,2.0,1.0) + @test B ≈ 4A'C + + b = @benchmark mul!($B,Transpose($A),$C) samples=10 evals=10 + @test minimum(b).allocs == 0 + + # A*B' + Bt = Transpose(B) + mul!(C,A,Bt) + @test C ≈ A*B' + mul!(C,A,Bt,2.0,0.0) + @test C ≈ 2A*B' + mul!(C,A,Bt,2.0,1.0) + @test C ≈ 4A*B' + + b = @benchmark mul!($C,$A,Transpose($B),1,2) samples=10 evals=10 + @test minimum(b).allocs == 0 + + # A'B' + B = rand(Mat{N1,N1}) + C = rand(Mat{N2,N1}) + mul!(C,Transpose(A),Transpose(B)) + @test C ≈ A'B' + mul!(C,Transpose(A),Transpose(B),2.0,0.0) + @test C ≈ 2A'B' + mul!(C,Transpose(A),Transpose(B),2.0,1.0) + @test C ≈ 4A'B' + + b = @benchmark mul!($C,Transpose($A),Transpose($B),1.0,2) samples=10 evals=10 + @test minimum(b).allocs == 0 + + # Transpose Output + C = rand(Mat{N1,N2}) + mul!(Transpose(C),Transpose(A),Transpose(B)) + @test C' ≈ A'B' + b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),1.0,2) samples=10 evals=10 + @test minimum(b).allocs == 0 end -@testset "Matrix multiply-add" begin - α,β = 2.0, 1.0 - - # Test matrix multiplication - N1,N2 = 3,2 - a = @MMatrix rand(N1,N2) - b = @MMatrix rand(N2,N2) - c = @MMatrix rand(N1,N2) - test_muladd(a, b, c, α, β) - test_muladd(a, b, c, 1, β) - test_muladd(a, b, c, α, 2) - test_muladd(a, b, c, α, Float32(3)) - - N1,N2 = 5,6 - a = @MMatrix rand(N1,N2) - b = @MMatrix rand(N2,N2) - c = @MMatrix rand(N1,N2) - test_muladd(a, b, c, α, β) - - N1,N2 = 14,16 - a = @MMatrix rand(N1,N2) - b = @MMatrix rand(N2,N2) - c = @MMatrix rand(N1,N2) - test_muladd(a, b, c, α, β) - - N1,N2 = 3,2 - a = SizedMatrix{N1,N2}(rand(N1,N2)) - b = SizedMatrix{N2,N2}(rand(N2,N2)) - c = SizedMatrix{N1,N2}(rand(N1,N2)) - test_muladd(a, b, c, α, β) - - N1,N2 = 5,6 - a = SizedMatrix{N1,N2}(rand(N1,N2)) - b = SizedMatrix{N2,N2}(rand(N2,N2)) - c = SizedMatrix{N1,N2}(rand(N1,N2)) - test_muladd(a, b, c, α, β) - - N1,N2 = 14,16 - a = SizedMatrix{N1,N2}(rand(N1,N2)) - b = SizedMatrix{N2,N2}(rand(N2,N2)) - c = SizedMatrix{N1,N2}(rand(N1,N2)) - test_muladd(a, b, c, α, β) - - # Test matrix-vector multiplication - N = 15 - a = @MMatrix rand(N,N) - b = @MVector rand(N) - c = @MVector rand(N) - test_muladd(a,b,c,α,β) - - N = 15 - a = rand(SizedMatrix{N,N}) - b = rand(SizedVector{N}) - c = rand(SizedVector{N}) - test_muladd(a,b,c,α,β) - - # Test outer products - N = 5 - a = @MVector rand(N) - b = @MVector rand(N) - c = @MMatrix rand(N,N) - test_muladd(a,b',c,α,β, false) - - N = 15 - a = rand(SizedVector{N}) - b = rand(SizedVector{N}) - c = rand(SizedMatrix{N,N}) - test_muladd(a,b',c,α,β, false) +# Test the three different +@testset "matrix multiply-add" begin + test_multiply_add(3,4) + test_multiply_add(5,6) + test_multiply_add(15,16) end - -#= -Summary: -All MMatrix sizes run without allocations -All MMatrix sizes have low compile times - -Slow compilation: - MMatrix initialization - MMatrix multiplication (not in place) -=# From 027ad4960d21959479bea689eecb9f7757dc14f6 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Sat, 7 Mar 2020 09:24:19 -0800 Subject: [PATCH 5/8] Hard-code MulAddMul and revert chunked muladd --- src/StaticArrays.jl | 2 +- src/matrix_multiply.jl | 30 +------------------ src/matrix_multiply_add.jl | 57 ++++++++++++++++++++++++++++--------- test/matrix_multiply_add.jl | 47 ++++++++++++++++++++++++------ 4 files changed, 85 insertions(+), 51 deletions(-) diff --git a/src/StaticArrays.jl b/src/StaticArrays.jl index e2027829..50983b89 100644 --- a/src/StaticArrays.jl +++ b/src/StaticArrays.jl @@ -125,8 +125,8 @@ include("broadcast.jl") include("mapreduce.jl") include("arraymath.jl") include("linalg.jl") -include("matrix_multiply.jl") include("matrix_multiply_add.jl") +include("matrix_multiply.jl") include("det.jl") include("inv.jl") include("solve.jl") diff --git a/src/matrix_multiply.jl b/src/matrix_multiply.jl index ad82c56b..b9bbb284 100644 --- a/src/matrix_multiply.jl +++ b/src/matrix_multiply.jl @@ -13,17 +13,6 @@ import LinearAlgebra: BlasFloat, matprod, mul! @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Adjoint{<:Any,<:StaticVector}) where {N} = vec(A) * B @inline *(A::StaticArray{Tuple{N,1},<:Any,2}, B::Transpose{<:Any,<:StaticVector}) where {N} = vec(A) * B -# @inline mul!(dest::StaticVecOrMat, A::StaticVecOrMat, B::StaticVecOrMat) = mul -# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::StaticMatrix) = mul!(dest, reshape(A, Size(Size(A)[1], 1)), B) -# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Transpose{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -# @inline mul!(dest::StaticVecOrMat, A::StaticVector, B::Adjoint{<:Any, <:StaticVector}) = _mul!(Size(dest), dest, Size(A), Size(B), A, B) -# #@inline *{TA<:LinearAlgebra.BlasFloat,Tb}(A::StaticMatrix{TA}, b::StaticVector{Tb}) -# @inline mul!(dest::StaticMatrix, A::Transpose{<:Any,<:StaticMatrix}, B::StaticMatrix) = -# _mul!(Size(dest), dest, Size(A), Size(B), A, B, true, false) -# @inline mul!(dest::StaticVector, A::Transpose{<:Any,<:StaticMatrix}, B::StaticVector) = -# _tmul!(Size(dest), dest, Size(A.parent), Size(B), A.parent, B, true, false) - - # Implementations @@ -184,7 +173,7 @@ end # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than (possibly) a mutable type. Avoids allocation == faster tmp_type_in = :(SVector{$(sb[1]), T}) tmp_type_out = :(SVector{$(sa[1]), T}) - vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(Size(a), Size($(sb[1])), a, + vect_exprs = [:($(Symbol("tmp_$k2"))::$tmp_type_out = partly_unrolled_multiply(TSize(a), TSize($(sb[1])), a, $(Expr(:call, tmp_type_in, [Expr(:ref, :b, LinearIndices(sb)[i, k2]) for i = 1:sb[1]]...)))::$tmp_type_out) for k2 = 1:sb[2]] @@ -200,21 +189,4 @@ end end end -@generated function partly_unrolled_multiply(::Size{sa}, ::Size{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb} - if sa[2] != sb[1] - throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb")) - end - - if sa[2] != 0 - exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:(a[$(LinearIndices(sa)[k, j])]*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]] - else - exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]] - end - - return quote - $(Expr(:meta,:noinline)) - @inbounds return SVector(tuple($(exprs...))) - end -end - # diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl index 18146cd1..8f3f8ac1 100644 --- a/src/matrix_multiply_add.jl +++ b/src/matrix_multiply_add.jl @@ -14,6 +14,8 @@ TSize(A::Type{<:Transpose{<:Any,<:StaticArray}}) = TSize{size(A),true}() TSize(A::Type{<:Adjoint{<:Real,<:StaticArray}}) = TSize{size(A),true}() # can't handle complex adjoints yet TSize(A::Type{<:StaticArray}) = TSize{size(A),false}() TSize(A::StaticArrayLike) = TSize(typeof(A)) +TSize(S::Size{s}, T=false) where s = TSize{s,T}() +TSize(s::Number) = TSize(Size(s)) istranpose(::TSize{<:Any,T}) where T = T size(::TSize{S}) where S = S Size(::TSize{S}) where S = Size{S}() @@ -27,7 +29,13 @@ Base.parent(A::StaticArray) = A # 5-argument matrix multiplication # To avoid allocations, strip away Transpose type and store tranpose info in Size @inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike, B::StaticVecOrMatLike, - α::Real, β::Real) = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), MulAddMul(α,β)) + α::Ta, β::Tb) where {Ta<:Real,Tb<:Real} = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), + MulAddMul{false,false,Ta,Tb}(α,β)) + +@inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike{T}, + B::StaticVecOrMatLike{T}) where T = + _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), MulAddMul{true,true,T,T}(one(T),zero(T))) + "Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling." @inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) = @@ -135,13 +143,13 @@ end return quote @_inline_meta muladd_unrolled_all!(Sc, c, Sa, Sb, a, b, _add) - # return c + return c end elseif mult_dim < 14*14*14 # Something seems broken for this one with large matrices (becomes allocating) return quote @_inline_meta muladd_unrolled_chunks!(Sc, c, Sa, Sb, a, b, _add) - # return c + return c end else if can_blas @@ -184,26 +192,49 @@ end end -@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, c::StaticMatrix, - Sa::TSize{sa}, Sb::TSize{sb}, - a::StaticMatrixLike, b::StaticMatrixLike, - _add::MulAddMul) where {sa, sb, sc} - if !check_dims(Size(sc),Size(sa),Size(sb)) +@generated function muladd_unrolled_chunks!(Sc::TSize{sc}, c::StaticMatrix, ::TSize{sa,tA}, Sb::TSize{sb,tB}, + a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) where {sa, sb, sc, tA, tB} + if sb[1] != sa[2] || sa[1] != sc[1] || sb[2] != sc[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end + #vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply(A, B[:, $k2])) for k2 = 1:sB[2]] + # Do a custom b[:, k2] to return a SVector (an isbitstype type) rather than a mutable type. Avoids allocation == faster tmp_type = SVector{sb[1], eltype(c)} + vect_exprs = [:($(Symbol("tmp_$k2")) = partly_unrolled_multiply($(TSize{sa,tA}()), $(TSize{(sb[1],),tB}()), + a, $(Expr(:call, tmp_type, [:($(_lind(:b, Sb, i, k2))) for i = 1:sb[1]]...)))) for k2 = 1:sb[2]] - col_mult = [:( - _mul!(Sc, c, Sa, Sb, a, - $(Expr(:call, tmp_type, - [:($(_lind(:b, Sb, i, k2))) for i = 1:sb[1]]...)),_add,Val($k2))) for k2 = 1:sb[2]] + lhs = [:($(_lind(:c, Sc, k1, k2))) for k1 = 1:sa[1], k2 = 1:sb[2]] + # exprs = [:(c[$(LinearIndices(sc)[k1, k2])] = $(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + rhs = [:($(Symbol("tmp_$k2"))[$k1]) for k1 = 1:sa[1], k2 = 1:sb[2]] + exprs = _muladd_expr(lhs, rhs, _add) return quote + @_inline_meta α = _add.alpha β = _add.beta - return $(Expr(:block, col_mult...)) + @inbounds $(Expr(:block, vect_exprs...)) + @inbounds $(Expr(:block, exprs...)) + end +end + +# @inline partly_unrolled_multiply(Sa::Size, Sb::Size, a::StaticMatrix, b::StaticArray) where {sa, sb, Ta, Tb} = +# partly_unrolled_multiply(TSize(Sa), TSize(Sb), a, b) +@generated function partly_unrolled_multiply(Sa::TSize{sa}, ::TSize{sb}, a::StaticMatrix{<:Any, <:Any, Ta}, b::StaticArray{<:Tuple, Tb}) where {sa, sb, Ta, Tb} + if sa[2] != sb[1] + throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb")) + end + + if sa[2] != 0 + exprs = [reduce((ex1,ex2) -> :(+($ex1,$ex2)), [:($(_lind(:a,Sa,k,j))*b[$j]) for j = 1:sa[2]]) for k = 1:sa[1]] + else + exprs = [:(zero(promote_op(matprod,Ta,Tb))) for k = 1:sa[1]] + end + + return quote + $(Expr(:meta,:noinline)) + @inbounds return SVector(tuple($(exprs...))) end end diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index 5fa84fdf..df20160f 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -1,7 +1,29 @@ -using Test using StaticArrays using LinearAlgebra using BenchmarkTools +using Test +# function benchmark_matmul(N1,N2,ArrayType=MArray) +# if ArrayType <: MArray +# Mat = MMatrix +# Vec = MVector +# elseif ArrayType <: SizedArray +# Mat = SizedMatrix +# Vec = SizedVector +# end +# α,β = 1.0, 1.0 +# A = rand(Mat{N1,N2}) +# B = rand(Mat{N2,N2}) +# C = rand(Mat{N1,N2}) +# println("C = A*B") +# @btime mul!($C,$A,$B) +# println("C = A*B + C") +# @btime mul!($C,$A,$B,$α,$β) +# println("B = A'C") +# @btime mul!($B,Transpose($A),$C) +# println("B = A'C + B") +# @btime mul!($B,Transpose($A),$C,$α,$β) +# end +# benchmark_matmul(20,20,SizedArray) # check_dims @test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,)) @@ -24,6 +46,8 @@ function test_multiply_add(N1,N2,ArrayType=MArray) Mat = SizedMatrix Vec = SizedVector end + α,β = 2.0, 1.0 + A = rand(Mat{N1,N2}) At = Transpose(A) b = rand(Vec{N2}) @@ -70,8 +94,12 @@ function test_multiply_add(N1,N2,ArrayType=MArray) bmark = @benchmark mul!($c,$A,$b) samples=10 evals=10 @test minimum(bmark).allocs == 0 - b = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 - @test minimum(b).allocs == 0 + bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 + @test minimum(bmark).allocs == 0 + bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10 + @test minimum(bmark).allocs == 0 # outer product C = rand(Mat{N1,N2}) @@ -98,7 +126,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,A,B,2.0,1.0) @test C ≈ 4A*B - b = @benchmark mul!($C,$A,$B,1,0) samples=10 evals=10 + b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 # A'B @@ -110,7 +138,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(B,At,C,2.0,1.0) @test B ≈ 4A'C - b = @benchmark mul!($B,Transpose($A),$C) samples=10 evals=10 + b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 # A*B' @@ -122,7 +150,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,A,Bt,2.0,1.0) @test C ≈ 4A*B' - b = @benchmark mul!($C,$A,Transpose($B),1,2) samples=10 evals=10 + b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 # A'B' @@ -135,14 +163,14 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(C,Transpose(A),Transpose(B),2.0,1.0) @test C ≈ 4A'B' - b = @benchmark mul!($C,Transpose($A),Transpose($B),1.0,2) samples=10 evals=10 + b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 # Transpose Output C = rand(Mat{N1,N2}) mul!(Transpose(C),Transpose(A),Transpose(B)) @test C' ≈ A'B' - b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),1.0,2) samples=10 evals=10 + b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 end @@ -151,4 +179,7 @@ end test_multiply_add(3,4) test_multiply_add(5,6) test_multiply_add(15,16) + test_multiply_add(3,4,SizedArray) + test_multiply_add(5,6,SizedArray) + test_multiply_add(15,16,SizedArray) end From fe44f2d868547201c1591d07cd36afd06053c9c3 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Mon, 23 Mar 2020 21:56:40 -0700 Subject: [PATCH 6/8] Don't use LinearAlgebra MulAddMul --- src/matrix_multiply_add.jl | 94 ++++++++++++++++++++++++-------------- 1 file changed, 60 insertions(+), 34 deletions(-) diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl index 8f3f8ac1..27a42226 100644 --- a/src/matrix_multiply_add.jl +++ b/src/matrix_multiply_add.jl @@ -1,4 +1,21 @@ -import LinearAlgebra.MulAddMul +# import LinearAlgebra.MulAddMul + +abstract type MulAddMul{T} end + +struct AlphaBeta{T} <: MulAddMul{T} + α::T + β::T + function AlphaBeta{T}(α,β) where T <: Real + new{T}(α,β) + end +end +@inline AlphaBeta(α::A,β::B) where {A,B} = AlphaBeta{promote_type(A,B)}(α,β) +@inline alpha(ab::AlphaBeta) = ab.α +@inline beta(ab::AlphaBeta) = ab.β + +struct NoMulAdd{T} <: MulAddMul{T} end +@inline alpha(ma::NoMulAdd{T}) where T = one(T) +@inline beta(ma::NoMulAdd{T}) where T = zero(T) """ Size that stores whether a Matrix is a Transpose Useful when selecting multiplication methods, and avoiding allocations when dealing with @@ -29,36 +46,18 @@ Base.parent(A::StaticArray) = A # 5-argument matrix multiplication # To avoid allocations, strip away Transpose type and store tranpose info in Size @inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike, B::StaticVecOrMatLike, - α::Ta, β::Tb) where {Ta<:Real,Tb<:Real} = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), - MulAddMul{false,false,Ta,Tb}(α,β)) + α::Real, β::Real) = _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), + AlphaBeta(α,β)) @inline LinearAlgebra.mul!(dest::StaticVecOrMatLike, A::StaticVecOrMatLike{T}, B::StaticVecOrMatLike{T}) where T = - _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), MulAddMul{true,true,T,T}(one(T),zero(T))) + _mul!(TSize(dest), parent(dest), TSize(A), TSize(B), parent(A), parent(B), NoMulAdd{T}()) "Calculate the product of the dimensions being multiplied. Useful as a heuristic for unrolling." @inline multiplied_dimension(A::Type{<:StaticVecOrMatLike}, B::Type{<:StaticVecOrMatLike}) = prod(size(A)) * size(B,2) -""" Combine left and right sides of an assignment expression, short-cutting - lhs = α * rhs + β * lhs, - element-wise. -If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed. -""" -function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul{Tα,Tβ}}) where {Tα,Tβ} - @assert length(lhs) == length(rhs) - n = length(rhs) - if !Tα # not 1 - rhs = [:(α * $(expr)) for expr in rhs] - end - if !Tβ # not 0 - rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n] - end - exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n] - return exprs -end - "Validate the dimensions of a matrix multiplication, including matrix-vector products" function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc} if sb[1] != sa[2] || sc[1] != sa[1] @@ -73,6 +72,28 @@ function check_dims(::Size{sc}, ::Size{sa}, ::Size{sb}) where {sa,sb,sc} return true end +""" Combine left and right sides of an assignment expression, short-cutting + lhs = α * rhs + β * lhs, + element-wise. +If α = 1, the multiplication by α is removed. If β = 0, the second rhs term is removed. +""" +function _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:AlphaBeta}) + @assert length(lhs) == length(rhs) + n = length(rhs) + rhs = [:(α * $(expr)) for expr in rhs] + rhs = [:($(lhs[k]) * β + $(rhs[k])) for k = 1:n] + exprs = [:($(lhs[k]) = $(rhs[k])) for k = 1:n] + _assign(lhs, rhs) + return exprs +end + +@inline _muladd_expr(lhs::Array{Expr}, rhs::Array{Expr}, ::Type{<:MulAddMul}) = _assign(lhs, rhs) + +@inline function _assign(lhs::Array{Expr}, rhs::Array{Expr}) + @assert length(lhs) == length(rhs) + [:($(lhs[k]) = $(rhs[k])) for k = 1:length(lhs)] +end + "Obtain an expression for the linear index of var[k,j], taking transposes into account" @inline _lind(A::Type{<:TSize}, k::Int, j::Int) = _lind(:a, A, k, j) function _lind(var::Symbol, A::Type{TSize{sa,tA}}, k::Int, j::Int) where {sa,tA} @@ -102,17 +123,18 @@ end return quote # @_inline_meta - α = _add.alpha - β = _add.beta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) @inbounds $(Expr(:block, exprs...)) return c end end # Outer product -@generated function _mul!(::TSize{sc}, c::StaticMatrix, ::TSize{sa,false}, ::TSize{sb,true}, a::StaticVector, - b::StaticVector, - _add::MulAddMul) where {sa, sb, sc} +@generated function _mul!(::TSize{sc}, c::StaticMatrix, ::TSize{sa,false}, ::TSize{sb,true}, + a::StaticVector, b::StaticVector, _add::MulAddMul) where {sa, sb, sc} if sc[1] != sa[1] || sc[2] != sb[2] throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $sc")) end @@ -123,8 +145,8 @@ end return quote @_inline_meta - α = _add.alpha - β = _add.beta + α = alpha(_add) + β = beta(_add) @inbounds $(Expr(:block, exprs...)) return c end @@ -185,8 +207,10 @@ end return quote @_inline_meta - α = _add.alpha - β = _add.beta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) @inbounds $(Expr(:block, exprs...)) end end @@ -212,8 +236,10 @@ end return quote @_inline_meta - α = _add.alpha - β = _add.beta + # α = _add.alpha + # β = _add.beta + α = alpha(_add) + β = beta(_add) @inbounds $(Expr(:block, vect_exprs...)) @inbounds $(Expr(:block, exprs...)) end @@ -248,7 +274,7 @@ function mul_blas!(::TSize{<:Any,false}, c::StaticMatrix, ::TSize{<:Any,tA}, ::T A = _get_raw_data(a) B = _get_raw_data(b) C = _get_raw_data(c) - BLAS.gemm!(mat_char(tA), mat_char(tB), T(_add.alpha), A, B, T(_add.beta), C) + BLAS.gemm!(mat_char(tA), mat_char(tB), T(alpha(_add)), A, B, T(beta(_add)), C) end # if C is transposed, transpose the entire expression From 651ddba698940f3975741cd8fc39e23e8922b2ff Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Tue, 14 Apr 2020 09:58:25 -0700 Subject: [PATCH 7/8] Add multiply-add benchmarks --- benchmark/bench_matrix_ops.jl | 45 +++++++++++++++++++++++++++++++++++ test/matrix_multiply_add.jl | 12 ++++++++-- 2 files changed, 55 insertions(+), 2 deletions(-) diff --git a/benchmark/bench_matrix_ops.jl b/benchmark/bench_matrix_ops.jl index 2795c1e2..e484aabd 100644 --- a/benchmark/bench_matrix_ops.jl +++ b/benchmark/bench_matrix_ops.jl @@ -41,5 +41,50 @@ for f in [*, \] end end +# Multiply-add +function benchmark_matmul(s,N1,N2,ArrayType) + if ArrayType <: MArray + Mat = MMatrix + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + label = "MMatrix" + elseif ArrayType <: SizedArray + Mat = SizedMatrix + A = rand(Mat{N1,N2}) + B = rand(Mat{N2,N2}) + C = rand(Mat{N1,N2}) + label = "SizedMatrix" + elseif ArrayType <: Array + A = rand(N1,N2) + B = rand(N2,N2) + C = rand(N1,N2) + label = "Matrix" + end + α,β = 1.0, 1.0 + s1 = s["mul!(C,A,B)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s2 = s["mul!(C,A,B,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s3 = s["mul!(B,A',C)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + s4 = s["mul!(B,A',C,α,β)"][string(N1, pad=2) * string(N2, pad=2)] = BenchmarkGroup() + + s1[label] = @benchmarkable mul!($C,$A,$B) + s2[label] = @benchmarkable mul!($C,$A,$B,$α,$β) + s3[label] = @benchmarkable mul!($B,Transpose($A),$C) + s4[label] = @benchmarkable mul!($B,Transpose($A),$C,$α,$β) +end + +begin + suite["mul!(C,A,B)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(C,A,B,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(B,A',C)"] = BenchmarkGroup(["inplace", "multiply-add"]) + suite["mul!(B,A',C,α,β)"] = BenchmarkGroup(["inplace", "multiply-add"]) + for N in matrix_sizes + for Mat in (MMatrix, SizedMatrix, Matrix) + benchmark_matmul(suite, N+1, N, Mat) + end + end +end + + end # module BenchMatrixOps.suite diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index df20160f..dccafb53 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -92,14 +92,16 @@ function test_multiply_add(N1,N2,ArrayType=MArray) mul!(b,At,c,1.0,2.0) @test b ≈ 5A'c - bmark = @benchmark mul!($c,$A,$b) samples=10 evals=10 - @test minimum(bmark).allocs == 0 + @test_noalloc mul!(c,A,b) bmark = @benchmark mul!($c,$A,$b,$α,$β) samples=10 evals=10 @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(c, A, b, α, β) # records 32 bytes bmark = @benchmark mul!($b,Transpose($A),$c) samples=10 evals=10 @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(b, Transpose(A), c) # records 16 bytes bmark = @benchmark mul!($b,Transpose($A),$c,$α,$β) samples=10 evals=10 @test minimum(bmark).allocs == 0 + # @test_noalloc mul!(b, Transpose(A), c, α, β) # records 48 bytes # outer product C = rand(Mat{N1,N2}) @@ -114,6 +116,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) b = @benchmark mul!($C,$a,$b') samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, a, b') # records 16 bytes # A × B A = rand(Mat{N1,N2}) @@ -128,6 +131,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) b = @benchmark mul!($C,$A,$B,$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, A, B, α, β) # records 32 bytes # A'B At = Transpose(A) @@ -140,6 +144,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) b = @benchmark mul!($B,Transpose($A),$C,$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(B, Transpose(A), C, α, β) # records 48 bytes # A*B' Bt = Transpose(B) @@ -152,6 +157,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) b = @benchmark mul!($C,$A,Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, A, Transpose(B), α, β) # records 48 bytes # A'B' B = rand(Mat{N1,N1}) @@ -165,6 +171,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) b = @benchmark mul!($C,Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(C, Transpose(A), Transpose(B), α, β) # records 64 bytes # Transpose Output C = rand(Mat{N1,N2}) @@ -172,6 +179,7 @@ function test_multiply_add(N1,N2,ArrayType=MArray) @test C' ≈ A'B' b = @benchmark mul!(Transpose($C),Transpose($A),Transpose($B),$α,$β) samples=10 evals=10 @test minimum(b).allocs == 0 + # @test_noalloc mul!(Transpose(C), Transpose(A), Transpose(B), α, β) # records 80 bytes end # Test the three different From 11eae73a2b6e072731897672f29857b2ca04cc71 Mon Sep 17 00:00:00 2001 From: Brian Jackson Date: Tue, 14 Apr 2020 10:02:09 -0700 Subject: [PATCH 8/8] Delete out commented code --- src/matrix_multiply_add.jl | 69 ------------------------------------- test/matrix_multiply_add.jl | 29 ++++------------ 2 files changed, 7 insertions(+), 91 deletions(-) diff --git a/src/matrix_multiply_add.jl b/src/matrix_multiply_add.jl index 27a42226..1802b8bb 100644 --- a/src/matrix_multiply_add.jl +++ b/src/matrix_multiply_add.jl @@ -281,72 +281,3 @@ end @inline mul_blas!(Sc::TSize{<:Any,true}, c::StaticMatrix, Sa::TSize, Sb::TSize, a::StaticMatrix, b::StaticMatrix, _add::MulAddMul) = mul_blas!(transpose(Sc), c, transpose(Sb), transpose(Sa), b, a, _add) - -# TODO: Get this version of mul_blas! working so there's backward-compatibility with older Julia versions - -# @generated function mul_blas!(::TSize{s,false}, c::StaticMatrix{<:Any, <:Any, T}, -# ::TSize{sa,tA}, ::TSize{sb,tB}, -# a::StaticMatrix{<:Any, <:Any, T}, b::StaticMatrix{<:Any, <:Any, T}, -# _add::MulAddMul = MulAddMul()) where {s,sa,sb, T <: BlasFloat, tA, tB} -# if sb[1] != sa[2] || sa[1] != s[1] || sb[2] != s[2] -# throw(DimensionMismatch("Tried to multiply arrays of size $sa and $sb and assign to array of size $s")) -# end -# -# if sa[1] > 0 && sa[2] > 0 && sb[2] > 0 -# # This code adapted from `gemm!()` in base/linalg/blas.jl -# -# if T == Float64 -# gemm = :dgemm_ -# elseif T == Float32 -# gemm = :sgemm_ -# elseif T == Complex{Float64} -# gemm = :zgemm_ -# else # T == Complex{Float32} -# gemm = :cgemm_ -# end -# -# mat_char(tA) = tA ? 'T' : 'N' -# transA = mat_char(tA) -# transB = mat_char(tB) -# -# m = sa[tA ? 2 : 1] -# ka = sa[tA ? 1 : 2] -# kb = sb[tB ? 2 : 1] -# n = sb[tB ? 1 : 2] -# -# blascall = quote -# ccall((LinearAlgebra.BLAS.@blasfunc($gemm), LinearAlgebra.BLAS.libblas), Nothing, -# (Ref{UInt8}, Ref{UInt8}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{LinearAlgebra.BLAS.BlasInt}, -# Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, -# Ptr{$T}, Ref{LinearAlgebra.BLAS.BlasInt}, Ref{$T}, Ptr{$T}, -# Ref{LinearAlgebra.BLAS.BlasInt}), -# $transA, $transB, $m, $n, -# $ka, alpha, A, strideA, -# B, strideB, beta, C, -# strideC) -# end -# -# return quote -# println($transA) -# println($transB) -# alpha = _add.alpha -# beta = _add.beta -# # m = $(sa[1]) -# # ka = $(sa[2]) -# # kb = $(sb[1]) -# # n = $(sb[2]) -# strideA = $m #$(sa[1]) -# strideB = $kb #$(sb[1]) -# strideC = $(s[1]) -# A = _get_raw_data(a) -# B = _get_raw_data(b) -# C = _get_raw_data(c) -# -# $blascall -# -# return c -# end -# else -# throw(DimensionMismatch("Cannot call BLAS gemm with zero-dimension arrays, attempted $sa * $sb -> $s.")) -# end -# end diff --git a/test/matrix_multiply_add.jl b/test/matrix_multiply_add.jl index dccafb53..cf6cf3c5 100644 --- a/test/matrix_multiply_add.jl +++ b/test/matrix_multiply_add.jl @@ -2,28 +2,13 @@ using StaticArrays using LinearAlgebra using BenchmarkTools using Test -# function benchmark_matmul(N1,N2,ArrayType=MArray) -# if ArrayType <: MArray -# Mat = MMatrix -# Vec = MVector -# elseif ArrayType <: SizedArray -# Mat = SizedMatrix -# Vec = SizedVector -# end -# α,β = 1.0, 1.0 -# A = rand(Mat{N1,N2}) -# B = rand(Mat{N2,N2}) -# C = rand(Mat{N1,N2}) -# println("C = A*B") -# @btime mul!($C,$A,$B) -# println("C = A*B + C") -# @btime mul!($C,$A,$B,$α,$β) -# println("B = A'C") -# @btime mul!($B,Transpose($A),$C) -# println("B = A'C + B") -# @btime mul!($B,Transpose($A),$C,$α,$β) -# end -# benchmark_matmul(20,20,SizedArray) + +macro test_noalloc(ex) + esc(quote + $ex + @test(@allocated($ex) == 0) + end) +end # check_dims @test StaticArrays.check_dims(Size(4,), Size(4,3), Size(3,))