Commit d16a4b3
Allow PermutedDimsArray in gemm_strided_batched (#539)
Authored by mcabbott (Michael Abbott)
Co-authored-by: Michael Abbott <me@escbook>
1 parent 397bde0 · commit d16a4b3

File tree: 6 files changed, +70 −29 lines
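In short, the patch teaches the GPU batched_mul path to accept lazily permuted inputs and batch-broadcast arguments by handing them straight to CUBLAS.gemm_strided_batched!. A minimal sketch of the call pattern it enables (the array names and sizes below are illustrative, not taken from the diff; assumes CUDA.jl with this commit plus NNlib 0.7.7):

    using CUDA, NNlib

    A = CUDA.rand(Float32, 4, 3, 5)        # batch of five 4×3 matrices
    B = CUDA.rand(Float32, 4, 6, 5)        # batch of five 4×6 matrices

    # Lazily swap the first two dims of A: a strided wrapper, not a copy.
    At = PermutedDimsArray(A, (2, 1, 3))   # 3×4×5 view of the same memory

    # With this commit the wrapper is passed through to
    # CUBLAS.gemm_strided_batched! without materialising a permuted copy.
    C = batched_mul(At, B)                 # 3×6×5 CuArray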

Manifest.toml

Lines changed: 3 additions & 3 deletions
@@ -105,10 +105,10 @@ uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
 uuid = "a63ad114-7e13-5084-954f-fe012c677804"

 [[NNlib]]
-deps = ["Libdl", "LinearAlgebra", "Pkg", "Requires", "Statistics"]
-git-tree-sha1 = "a8180fd1445e31c0b1add98dae8da694ac2c23fd"
+deps = ["Compat", "Libdl", "LinearAlgebra", "Pkg", "Requires", "Statistics"]
+git-tree-sha1 = "1ae42464fea5258fd2ff49f1c4a40fc41cba3860"
 uuid = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
-version = "0.7.6"
+version = "0.7.7"

 [[OrderedCollections]]
 git-tree-sha1 = "cf59cfed2e2c12e8a2ff0a4f1e9b2cd8650da6db"

Project.toml

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ GPUArrays = "6.1.0"
 GPUCompiler = "0.8.1"
 LLVM = "3"
 MacroTools = "0.5"
-NNlib = "0.6.5, 0.7"
+NNlib = "0.7.7"
 Reexport = "0.2"
 Requires = "0.5, 1.0"
 TimerOutputs = "0.5"

lib/cublas/wrappers.jl

Lines changed: 13 additions & 12 deletions
@@ -923,15 +923,16 @@ for (fname, elty) in
     function gemm_strided_batched!(transA::Char,
                                    transB::Char,
                                    alpha::Number,
-                                   A::DenseCuArray{$elty, 3},
-                                   B::DenseCuArray{$elty, 3},
+                                   A::AbstractArray{$elty, 3},  # allow PermutedDimsArray
+                                   B::AbstractArray{$elty, 3},
                                    beta::Number,
-                                   C::DenseCuArray{$elty, 3})
+                                   C::AbstractArray{$elty, 3})
        m = size(A, transA == 'N' ? 1 : 2)
        k = size(A, transA == 'N' ? 2 : 1)
        n = size(B, transB == 'N' ? 2 : 1)

-       @assert size(A, 3) == size(B, 3) == size(C, 3) "Batch size mismatch"
+       @assert size(A, 3) == size(C, 3) || size(A, 3) == 1 "batch size mismatch: A != C"
+       @assert size(B, 3) == size(C, 3) || size(B, 3) == 1 "batch size mismatch: B != C"

        if m != size(C,1) || n != size(C,2) || k != size(B, transB == 'N' ? 1 : 2)
            throw(DimensionMismatch(""))
@@ -940,26 +941,26 @@ for (fname, elty) in
        ldb = max(1,stride(B,2))
        ldc = max(1,stride(C,2))

-       strideA = stride(A, 3)
-       strideB = stride(B, 3)
+       strideA = size(A, 3) == 1 ? 0 : stride(A, 3)
+       strideB = size(B, 3) == 1 ? 0 : stride(B, 3)
        strideC = stride(C, 3)
-       batchCount = size(A, 3)
+       batchCount = size(C, 3)
        $fname(handle(), transA, transB, m, n, k, alpha, A, lda, strideA, B,
              ldb, strideB, beta, C, ldc, strideC, batchCount)
        C
    end
    function gemm_strided_batched(transA::Char,
                                  transB::Char,
                                  alpha::Number,
-                                 A::DenseCuArray{$elty, 3},
-                                 B::DenseCuArray{$elty, 3})
-       C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), size(A, 3)))
+                                 A::AbstractArray{$elty, 3},
+                                 B::AbstractArray{$elty, 3})
+       C = similar(B, (size(A, transA == 'N' ? 1 : 2), size(B, transB == 'N' ? 2 : 1), max(size(A, 3), size(B, 3))))
        gemm_strided_batched!(transA, transB, alpha, A, B, zero($elty), C )
    end
    function gemm_strided_batched(transA::Char,
                                  transB::Char,
-                                 A::DenseCuArray{$elty, 3},
-                                 B::DenseCuArray{$elty, 3})
+                                 A::AbstractArray{$elty, 3},
+                                 B::AbstractArray{$elty, 3})
        gemm_strided_batched(transA, transB, one($elty), A, B)
    end
 end
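As a rough illustration of the relaxed signature above (names and sizes are made up, not from the patch): a batch dimension of 1 is now broadcast by sending cuBLAS a stride of 0, and a lazy wrapper that keeps stride 1 along its first dimension can be passed without copying.

    using CUDA
    using CUDA.CUBLAS: gemm_strided_batched!

    A = CUDA.rand(Float32, 2, 3, 1)    # a single 2×3 matrix
    B = CUDA.rand(Float32, 3, 4, 7)    # seven 3×4 matrices
    C = CUDA.zeros(Float32, 2, 4, 7)

    # size(A, 3) == 1, so strideA is passed as 0 and the one slice of A is
    # reused against every slice of B; batchCount now comes from C.
    gemm_strided_batched!('N', 'N', 1f0, A, B, 0f0, C)

    # A lazy (1,3,2) re-permutation keeps stride 1 along dim 1, so the
    # wrapper itself is accepted, with ldb and strideB read from its strides.
    Bp = PermutedDimsArray(permutedims(B, (1, 3, 2)), (1, 3, 2))  # same entries as B
    gemm_strided_batched!('N', 'N', 1f0, A, Bp, 0f0, C)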

src/array.jl

Lines changed: 6 additions & 0 deletions
@@ -429,6 +429,12 @@ function Base.unsafe_convert(::Type{CuPtr{T}}, V::SubArray{T,N,P,<:Tuple{Vararg{
 end


+## PermutedDimsArray
+
+Base.unsafe_convert(::Type{CuPtr{T}}, A::PermutedDimsArray) where {T} =
+    Base.unsafe_convert(CuPtr{T}, parent(A))
+
+
 ## reshape

 # optimize reshape to return a CuArray
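The rationale for the two lines added above: a PermutedDimsArray is a zero-copy wrapper whose first element coincides with its parent's, so its device pointer is just the parent's pointer. A small illustrative check, not part of the patch:

    using CUDA

    X  = CUDA.rand(Float32, 4, 5)
    Xp = PermutedDimsArray(X, (2, 1))    # lazy permutation: same buffer, no copy

    # The wrapper never offsets its first element, so converting it to a
    # CuPtr can simply defer to the parent -- which is what the new method does.
    p_parent  = Base.unsafe_convert(CuPtr{Float32}, X)
    p_wrapper = Base.unsafe_convert(CuPtr{Float32}, Xp)
    p_parent == p_wrapper                # expected to hold: same buffer, same offset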

src/nnlib.jl

Lines changed: 5 additions & 12 deletions
@@ -23,16 +23,9 @@ end


 # Batched matrix multiplication
+# 1st argument is produced by NNlib.storage_type(A)
+NNlib._batched_gemm!(::Type{<:CuArray}, transA::Char, transB::Char, α::Number, A, B, β::Number, C) =
+    CUBLAS.gemm_strided_batched!(transA, transB, α, A, B, β, C)

-const batched_gemm_args = [
-    (:(CuArray{T, 3}), 'N'),
-    (:(NNlib.BatchedTranspose{T, <:CuArray{T, 3}}), 'T'),
-    (:(NNlib.BatchedAdjoint{T, <:CuArray{T, 3}}), 'C')
-]
-
-for (TA, transA) in batched_gemm_args, (TB, transB) in batched_gemm_args
-    @eval function NNlib.batched_mul!(C::CuArray{T, 3}, A::$TA, B::$TB) where {T<:CUBLAS.CublasFloat}
-        CUBLAS.gemm_strided_batched!($transA, $transB, one(T), NNlib._unbatch(A), NNlib._unbatch(B), zero(T), C)
-        C
-    end
-end
+Base.unsafe_convert(::Type{CuPtr{T}}, A::NNlib.BatchedAdjOrTrans{T}) where {T} =
+    Base.unsafe_convert(CuPtr{T}, parent(A))
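For context, NNlib ≥ 0.7.7 routes batched_mul! through NNlib._batched_gemm!, keyed on the storage type it finds after unwrapping lazy wrappers; the one-line method above plugs CuArrays into that hook, and the unsafe_convert method lets the BatchedAdjOrTrans wrappers themselves be handed to cuBLAS via their parent's pointer. A hedged sketch of the resulting dispatch (illustrative names; the exact lowered call is an assumption):

    using CUDA, NNlib
    using NNlib: batched_mul, batched_adjoint, storage_type

    A = CUDA.rand(Float32, 3, 3, 2)
    B = CUDA.rand(Float32, 3, 3, 2)

    # NNlib peels lazy wrappers (adjoints, PermutedDimsArray, views) to find
    # the array type underneath; that type selects the _batched_gemm! method above.
    storage_type(PermutedDimsArray(A, (2, 1, 3)))   # expected: CuArray{Float32, 3}

    # This call should therefore end up, roughly, as
    #   NNlib._batched_gemm!(CuArray{Float32,3}, 'C', 'N', one(T), A, B, zero(T), C)
    # which the hook above forwards to CUBLAS.gemm_strided_batched!.
    C = batched_mul(batched_adjoint(A), B)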

test/nnlib.jl

Lines changed: 42 additions & 1 deletion
@@ -1,7 +1,7 @@
 using NNlib

 @testset "batched_mul" begin
-    using NNlib: batched_mul, batched_adjoint, batched_transpose
+    using NNlib: batched_mul, batched_mul!, batched_vec, batched_adjoint, batched_transpose

     A = randn(Float32, 3,3,2);
     B = randn(Float32, 3,3,2);
@@ -14,6 +14,47 @@ using NNlib

     Ca = batched_mul(A, batched_adjoint(B))
     @test CuArray(Ca) ≈ batched_mul(CuArray(A), batched_adjoint(CuArray(B)))
+
+    # 5-arg batched_mul!
+    C .= pi
+    batched_mul!(C, A, B, 2f0, 3f0)
+    cuCpi = CuArray(similar(C)) .= pi
+    @test CuArray(C) ≈ batched_mul!(cuCpi, CuArray(A), CuArray(B), 2f0, 3f0)
+
+    # PermutedDimsArray
+    @test CuArray(Ct) ≈ batched_mul(PermutedDimsArray(CuArray(A), (2,1,3)), CuArray(B))
+
+    D = permutedims(B, (1,3,2))
+    Cp = batched_mul(batched_adjoint(A), B)
+    @test CuArray(Cp) ≈ batched_mul(batched_adjoint(CuArray(A)), PermutedDimsArray(CuArray(D), (1,3,2)))
+
+    # Methods which reshape
+    M = randn(Float32, 3,3)
+
+    Cm = batched_mul(A, M)
+    @test CuArray(Cm) ≈ batched_mul(CuArray(A), CuArray(M))
+
+    Cv = batched_vec(permutedims(A,(3,1,2)), M)
+    @test CuArray(Cv) ≈ batched_vec(PermutedDimsArray(CuArray(A),(3,1,2)), CuArray(M))
+end
+
+@testset "NNlib storage_type etc." begin
+    using LinearAlgebra
+    using NNlib: is_strided, are_strided, storage_type
+
+    M = cu(ones(10,10))
+
+    @test is_strided(M)
+    @test is_strided(view(M, 1:2:5,:))
+    @test is_strided(PermutedDimsArray(M, (2,1)))
+
+    @test !is_strided(reshape(view(M, 1:2:10,:), 10,:))
+    @test !is_strided((M .+ im)')
+    @test !is_strided(Diagonal(cu(ones(3))))
+
+    @test storage_type(M) == CuArray{Float32,2}
+    @test storage_type(reshape(view(M, 1:2:10,:), 10,:)) == CuArray{Float32,2}
+
 end

 @testset "Broadcast Fix" begin
