diff --git a/Project.toml b/Project.toml
index c9d8f73..f1a521d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,2 +1,14 @@
+name = "ParallelMergeCSR"
+uuid = "c4b929cc-6141-4ba4-8849-280a863346e1"
+version = "0.1.0"
+
 [deps]
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index ca33a3a..1a74996 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -1,13 +1,187 @@
 module ParallelMergeCSR
-
+using Atomix
+using Base.Threads
 using SparseArrays
+using LinearAlgebra: Adjoint, adjoint, Transpose, transpose
+using SparseArrays: AbstractSparseMatrixCSC,
+                    DenseInputVecOrMat,
+                    getcolptr
+
+mutable struct Coordinate
+    x::Int
+    y::Int
+end
+
+# Want to give the illusion that we're still using 0-based coordinates when the actual
+# values themselves are 1-based
+#
+# diagonal -> i + j = k
+# a_len and b_len can stay the same as in the paper
+#
+# a -> row-end offsets, so really pass in a[2:end]
+# b -> "natural" numbers
+function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
+    # Diagonal search range (in x-coordinate space)
+    x_min = max(diagonal - b_len, 0)
+    x_max = min(diagonal, a_len)
+    # 2D binary search along the diagonal search range
+    while (x_min < x_max)
+        pivot = (x_min + x_max) >> 1
+        if (a[pivot + 1] <= b[diagonal - pivot])
+            x_min = pivot + 1
+        else
+            x_max = pivot
+        end
+    end
+
+    return Coordinate(
+        min(x_min, a_len),
+        diagonal - x_min
+    )
+end
+
+# Internally treats the matrix as if it has been transposed/is an adjoint,
+# e.g. if you pass in a 2x3, internally it's a 3x2, and it can be an adjoint if you pass in `adjoint` as the op
+function merge_csr_mv!(α::Number, A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    cp = getcolptr(A)
+
+    nnz = length(nzv)
+
+    # nrows = length(cp) - 1 can give the wrong number of rows!
+    nrows = A.n
+
+    nz_indices = collect(1:nnz)
+    row_end_offsets = cp[2:end] # nzval ordering differs between formats
+    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
+
+    num_threads = nthreads()
+    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
+
+    row_carry_out = zeros(eltype(cp), num_threads)
+    value_carry_out = zeros(eltype(output), num_threads) # value type must match output
+
+    # Julia thread ids start at 1, so make sure to offset!
+    @threads for tid in 1:num_threads
+        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
+        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
+
+        # Get starting and ending thread coordinates (row, nzv)
+        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
+
+        # Consume merge items, whole rows first
+        running_total = zero(eltype(output))
+        while thread_coord.x < thread_coord_end.x
+            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
+                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                thread_coord.y += 1
+            end
+
+            @inbounds output[thread_coord.x + 1] += α * running_total
+            running_total = zero(eltype(output))
+            thread_coord.x += 1
+        end
+
+        # A thread may end up partially consuming a row.
+        # Save the result from the partial consumption and do one pass at the end to add it back to y
+        while thread_coord.y < thread_coord_end.y
+            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            thread_coord.y += 1
+        end
+
+        # Save carry-outs
+        @inbounds row_carry_out[tid] = thread_coord_end.x + 1
+        @inbounds value_carry_out[tid] = running_total
+    end
+
+    for tid in 1:num_threads
+        @inbounds if row_carry_out[tid] <= nrows
+            @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid]
+        end
+    end
+end
+
+# C = adjoint(A)Bα + Cβ
+# C = transpose(A)Bα + Cβ
+# C = xABα + Cβ
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
+    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+        # obtain the original matrix underneath the "lazy wrapper"
+        A = xA.parent
+        size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+        size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+        size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+        if β != 1
+            # preemptively handle β so that below we only handle C = ABα + C
+            β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+        end
+        for (col_idx, input) in enumerate(eachcol(B))
+            output = @view C[:, col_idx]
+            merge_csr_mv!(α, A, input, output, $t)
+        end
+        C
+    # end of @eval macro
+    end
+# end of for loop
+end
+
+## mutating function names should be suffixed with an exclamation mark
+function atomic_add!(a::AbstractVector{T}, b::T) where T <: Real
+    # There's an atomic_add! in Threads, but it has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes,
+    # so you'd have to wrap each element of the array in Atomic.
+    # Objective: for each element in the AbstractVector, atomically add b to that element.
+    # Can also use Atomix.jl
+    for idx in 1:length(a)
+        Atomix.@atomic a[idx] += b
+    end
+end

-include("types.jl")
-include("threaded.jl")
+function atomic_add!(a::AbstractVector{T}, b::T) where T <: Complex
+    # reinterpret as the real type underlying the complex elements
+    view_a = reinterpret(reshape, real(T), a)
+    real_b = real(b)
+    imag_b = imag(b)

-# include("gpu.jl")
-# include("distributed.jl")
+    # mutating the view also mutates the underlying array it came from
+    for idx in 1:size(view_a, 2)
+        Atomix.@atomic view_a[1, idx] += real_b
+        Atomix.@atomic view_a[2, idx] += imag_b
+    end
+end
+
+function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    size(A, 1) == size(C, 1) || throw(DimensionMismatch())
+    size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    if β != 1
+        # preemptively handle β so that below we only handle C = ABα + C
+        β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+    end
+    for k in 1:size(C, 2)
+        @threads for col in 1:size(A, 2)
+            @inbounds αxj = B[col, k] * α
+            for j in nzrange(A, col)
+                @inbounds val = nzv[j] * αxj
+                @inbounds row = rv[j]
+                @inbounds out = @view C[row:row, k]
+                atomic_add!(out, val)
+            end
+        end
+    end
+    C
+end
+# end of the module
 end
\ No newline at end of file
diff --git a/src/threaded.jl b/src/threaded.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/types.jl b/src/types.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/test/atomic_add!.jl b/test/atomic_add!.jl
new file mode 100644
index 0000000..5814afa
--- /dev/null
+++ b/test/atomic_add!.jl
@@ -0,0 +1,27 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+@testset "atomic_add!" begin
+    # atomic add with real
+    @testset "Real" begin
+        a = rand(Float64, 20)
+        a_copy = deepcopy(a)
+
+        b = 1.25
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+
+    # atomic add with complex
+    @testset "Complex" begin
+        a = rand(Complex{Float64}, 20)
+        a_copy = deepcopy(a)
+
+        b = 2.5 + 0.8im
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+end
\ No newline at end of file
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
new file mode 100644
index 0000000..0eda52c
--- /dev/null
+++ b/test/merge_csr_mv!.jl
@@ -0,0 +1,225 @@
+using Test
+using ParallelMergeCSR: merge_csr_mv!
+using SparseArrays
+
+## NOTE: Sparse matrices are converted to dense form in the @test's,
+## since our redefinition of SparseArrays.mul! would otherwise
+## interfere
+@testset "Extreme Cases" begin
+
+    @testset "Singleton (Real)" begin
+        A = sparse(reshape([1], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(size(A, 1))
+
+        merge_csr_mv!(0.3, A, x, y, transpose)
+
+        @test Matrix(A) * x * 0.3 == y
+    end
+
+    @testset "Singleton (Complex)" begin
+        A = sparse(reshape([1.0+2.5im], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(eltype(A), size(A, 1))
+
+        merge_csr_mv!(10.1, A, x, y, adjoint)
+
+        @test adjoint(Matrix(A)) * x * 10.1 == y
+    end
+
+    @testset "Single row (Real)" begin
+
+        # 10 x 1, converted to 1 x 10 internally
+        A = 10.0 * SparseArrays.sprand(10, 1, 0.3)
+
+        # x needs to be 10 x 1
+        x = rand(size(A, 1))
+
+        y = zeros(eltype(A), 1)
+
+        merge_csr_mv!(1.1, A, x, y, transpose)
+
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
+    end
+
+    @testset "Single row (Complex)" begin
+
+        # 10 x 1 is treated as 1 x 10 inside merge_csr_mv!
+        A = SparseArrays.sprand(Complex{Float64}, 10, 1, 0.3)
+
+        x = rand(eltype(A), size(A, 1))
+
+        y = zeros(eltype(A), 1)
+
+        merge_csr_mv!(1.1, A, x, y, transpose)
+
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
+    end
+
+    #=
+    A single-column test wouldn't work because merge_csr_mv! is
+    designed to handle Ax = y where A is m x n, x is n x 1, and
+    y must be m x 1.
+
+    Given a single-column A stored as 1 x z, merge_csr_mv! treats
+    it as z x 1, which means y is constrained to 1 x 1 while the
+    correct output would be z x 1, a case merge_csr_mv! is not
+    designed for.
+
+    On the other hand, the overridden SparseArrays.mul! should work.
+    =#
+end
+
+# NOTE: this test fails; y seems to have lots of zero entries
+@testset "Square (Real)" begin
+    A = SparseArrays.sprand(10, 10, 0.3)
+
+    # 10 x 1
+    x = rand(10)
+
+    # 10 x 1
+    y = zeros(size(x))
+
+    merge_csr_mv!(1.1, A, x, y, adjoint)
+
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
+end
+
+@testset "Square (Complex)" begin
+    A = SparseArrays.sprand(Complex{Float64}, 10, 10, 0.3)
+
+    x = 10 * rand(Complex{Float64}, 10)
+
+    y = zeros(eltype(A), size(x))
+
+    merge_csr_mv!(1.1, A, x, y, adjoint)
+
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
+end
+
+@testset "4x6 (Real)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
+
+    # get into CSC form
+    A = sparse(m)
+
+    # create vector
+    x = [5, 2, 3, 1]
+
+    # create empty solution
+    y = zeros(Int64, size(A, 2))
+
+    # multiply
+    merge_csr_mv!(2.0, A, x, y, adjoint)
+
+    @test Matrix(adjoint(m)) * x * 2.0 == y
+end
+
+@testset "4x6 (Complex)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
+
+    # get into CSC form
+    A = sparse(m)
+
+    # create vector
+    x = 22.1 * rand(Complex{Float64}, 4)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 2))
+
+    # multiply
+    merge_csr_mv!(2.0, A, x, y, adjoint)
+
+    @test Matrix(adjoint(m)) * x * 2.0 == y
+end
+
+@testset "100x100 (Real)" begin
+    # create matrix
+    A = SparseArrays.sprand(100, 100, 0.3)
+
+    # create vector
+    x = rand(100)
+
+    # create empty solution
+    y = zeros(size(A, 1))
+
+    merge_csr_mv!(3.0, A, x, y, transpose)
+
+    @test transpose(Matrix(A)) * x * 3 ≈ y
+end
+
+@testset "100x100 (Complex)" begin
+    # create matrix
+    A = SparseArrays.sprand(Complex{Float64}, 100, 100, 0.3)
+
+    # create vector
+    x = rand(Complex{Float64}, 100)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 1))
+
+    merge_csr_mv!(3.0, A, x, y, transpose)
+
+    @test transpose(Matrix(A)) * x * 3 ≈ y
+end
+
+#=
+    NOTE: While merge_csr_mv! can be used this way, the overridden
+    SparseArrays.mul! should be the preferred method for
+    matrix-matrix multiplication.
+=#
+@testset "Matrix-Matrix (Real)" begin
+
+    # Create matrices
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
+    A = sprand(3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
+
+    α = 9.2
+
+    Y = zeros(2, 4)
+
+    for (idx, col) in enumerate(eachcol(X))
+        Y_view = @view Y[:, idx]
+        merge_csr_mv!(α, A, col, Y_view, transpose)
+    end
+
+    @test transpose(Matrix(A)) * X * 9.2 ≈ Y
+end
+
+@testset "Matrix-Matrix (Complex)" begin
+
+    # Create matrices
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
+    A = sprand(Complex{Float64}, 3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
+
+    α = 9.2
+
+    Y = zeros(eltype(A), 2, 4)
+
+    for (idx, col) in enumerate(eachcol(X))
+        Y_view = @view Y[:, idx]
+        merge_csr_mv!(α, A, col, Y_view, adjoint)
+    end
+
+    @test adjoint(Matrix(A)) * X * 9.2 ≈ Y
+end
\ No newline at end of file
diff --git a/test/mul!.jl b/test/mul!.jl
new file mode 100644
index 0000000..5b22b6a
--- /dev/null
+++ b/test/mul!.jl
@@ -0,0 +1,186 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+# These should trigger merge_csr_mv! in this repo rather than dispatching to a mul! defined elsewhere
+@testset "Adjoint Real" begin
+
+    A = adjoint(sprand(5, 5, 0.3))
+
+    B = rand(5, 5)
+    α = 11.2
+
+    C = rand(5, 5)
+    C_copy = deepcopy(C)
+    β = 3.9
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Adjoint Complex" begin
+
+    # C = adjoint(A)Bα + Cβ
+    A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3))
+    B = rand(5, 5)
+    α = 2.3
+
+    C = rand(Complex{Float64}, 5, 5)
+    C_copy = deepcopy(C)
+    β = 1.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+# These also trigger merge_csr_mv!
+@testset "Transpose Square Real" begin
+
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sparse(rand(1:5, 4, 4)))
+    B = rand(1:5, (4, 4))
+    α = 1.0
+
+    C = zeros(4, 4)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    # the right-hand side is correct; the left-hand side is problematic
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Square Complex" begin
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sparse(rand(Complex{Float64}, 4, 4)))
+    B = rand(4, 4)
+    α = 1.0
+
+    C = zeros(eltype(A), 4, 4)
+    C_copy = deepcopy(C)
+    β = 6.11 + 9.2im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular Real" begin
+
+    A = sparse(rand(4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 9.1
+
+    C = zeros(eltype(A), 2, 3)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular Complex" begin
+
+    A = sparse(rand(Complex{Float64}, 4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 0.0 + 5.3im
+
+    C = zeros(eltype(A), 2, 3)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Matrix (Real)" begin
+
+    # A should be sparse, B should be dense
+    A = sprand(5, 5, 0.2)
+    B = rand(5, 5)
+    α = 1.1
+
+    C = zeros(5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Matrix (Complex)" begin
+
+    # A should be sparse, B should be dense
+    A = sprand(5, 5, 0.2)
+    B = rand(Complex{Float64}, 5, 5)
+    α = 1.1
+
+    C = zeros(eltype(B), 5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Vector (Real)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(5)
+    α = 2.5
+
+    C = zeros(Float64, size(B))
+    C_copy = deepcopy(C)
+    β = 5.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Vector (Complex)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(Complex{Float64}, 5)
+    α = 2.5
+
+    C = zeros(eltype(B), size(B))
+    C_copy = deepcopy(C)
+    β = 2.1 + 0.1im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index e69de29..ecfe12b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -0,0 +1,13 @@
+using Test
+
+@testset "merge_csr_mv!" begin
+    include("merge_csr_mv!.jl")
+end
+
+@testset "atomic_add!" begin
+    include("atomic_add!.jl")
+end
+
+@testset "mul!" begin
+    include("mul!.jl")
+end
\ No newline at end of file
diff --git a/test/threaded.jl b/test/threaded.jl
deleted file mode 100644
index 08debd5..0000000
--- a/test/threaded.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-using SparseArrays
-using Random
-
-
-
-
-
-
-
diff --git a/test/types.jl b/test/types.jl
deleted file mode 100644
index e69de29..0000000
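
Reviewer note, not part of the diff: the snippet below is a minimal smoke-test sketch of the API this patch adds. It assumes the package is dev'd into the active environment and that Julia was started with multiple threads (e.g. `julia --threads=4`); the matrix sizes and the α/β values here are illustrative, not taken from the PR.

    using LinearAlgebra, SparseArrays
    using ParallelMergeCSR  # loading the module replaces SparseArrays.mul! for CSC inputs

    A = sprand(1_000, 1_000, 0.05)   # sparse CSC matrix
    B = rand(1_000, 4)               # dense right-hand side
    C = zeros(1_000, 4)

    mul!(C, A, B, 2.0, 0.0)          # C = A*B*2.0, via the atomics-based kernel
    mul!(C, A', B, 1.5, 1.0)         # C += adjoint(A)*B*1.5, via merge_csr_mv!

    # should hold up to roundoff (illustrative check, not from the PR's tests)
    C ≈ Matrix(A') * B * 1.5 + Matrix(A) * B * 2.0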
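A second reviewer note, also not part of the diff: how the merge-path split balances work. The kernel conceptually merges the row-end-offsets array with the natural numbers 1..nnz; cutting that combined list into equal shares gives each thread a balanced mix of row transitions and nonzeros, and merge_path_search finds the cut point by binary search along a diagonal. The sketch below reaches into non-exported names (ParallelMergeCSR.merge_path_search, SparseArrays.getcolptr) purely for illustration.

    using SparseArrays
    using ParallelMergeCSR

    # the 4x6 Wikipedia matrix used in the tests
    A = sparse([10 20 0 0 0 0;
                0 30 0 40 0 0;
                0 0 50 60 70 0;
                0 0 0 0 0 80])

    cp = SparseArrays.getcolptr(A)        # [1, 2, 4, 5, 7, 8, 9]
    row_end_offsets = cp[2:end]           # the kernel's "rows" are columns of the CSC input
    nnz_count = length(nonzeros(A))       # 8 stored values
    num_merge_items = nnz_count + length(row_end_offsets)   # 8 + 6 = 14

    # Where would thread 2 of 2 start? Halve the merge path and search the diagonal.
    half = cld(num_merge_items, 2)
    c = ParallelMergeCSR.merge_path_search(half, length(row_end_offsets), nnz_count,
                                           row_end_offsets, collect(1:nnz_count))
    (c.x, c.y)   # 0-based (row, nonzero) coordinate where the second thread picks up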