From 51c8ece3d32e43e993b099eba02dcc9957296cce Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Tue, 20 Dec 2022 16:45:02 -0500 Subject: [PATCH 01/26] removing old files and setting todos --- src/ParallelMergeCSR.jl | 136 ++++++++++++++++++++++++++++++++++++++-- src/threaded.jl | 0 src/types.jl | 0 test/threaded.jl | 9 --- test/types.jl | 0 5 files changed, 131 insertions(+), 14 deletions(-) delete mode 100644 src/threaded.jl delete mode 100644 src/types.jl delete mode 100644 test/threaded.jl delete mode 100644 test/types.jl diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index ca33a3a..f9853bd 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -1,13 +1,139 @@ module ParallelMergeCSR +using SparseArrays: AbstractSparseMatrixCSC +using LinearAlgebra:Adjoint,adjoint,Transpose,transpose -using SparseArrays +using Base.Threads -include("types.jl") -include("threaded.jl") +mutable struct Coordinate + x::Int + y::Int +end -# include("gpu.jl") -# include("distributed.jl") +# Want to give the illusion that we're still using 0-based coordinates when the actual +# values themselves are 1-based +# +# diagonal -> i + j = k +# a_len and b_len can stay the same from the paper +# +# a -> row-end offsets so really pass in a[2:end] +# b -> "natural" numbers +function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) + # Diagonal search range (in x coordinate space) + x_min = max(diagonal - b_len, 0) + x_max = min(diagonal, a_len) + # 2D binary-search along diagonal search range + while (x_min < x_max) + pivot = (x_min + x_max) >> 1 + if (a[pivot + 1] <= b[diagonal - pivot]) + x_min = pivot + 1 + else + x_max = pivot + end + end + + return Coordinate( + min(x_min, a_len), + diagonal - x_min + ) + +end + +# Ax = y +# A matrix +# x, y vectors + +# TODO: overload SparseArrays.mul! +# TODO: add β and α arguments +# TODO: change SparseMatrixCSR => Transpose{AbstractSparseMatrixCSC} +# 1. colval => rowval +# 2. rowptr => colptr +# 3. use iterface of SparseArrays to get these quantities +function merge_csr_mv!(A::SparseMatrixCSR, x, y) + row_end_offsets = A.rowptr[2:end] + nnz_indices = collect(1:length(A.nzval)) + num_merge_items = length(A.nzval) + A.m + + num_threads = nthreads() + items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads + + row_carry_out = zeros(Int, num_threads) + value_carry_out = zeros(Float64, num_threads) + + # Julia threads start id by 1, so make sure to offset! + @threads for tid in 1:num_threads + diagonal = min(items_per_thread * (tid - 1), num_merge_items) + diagonal_end = min(diagonal + items_per_thread, num_merge_items) + + # Get starting and ending thread coordinates (row, nnz) + thread_coord = merge_path_search(diagonal, A.m, length(A.nzval), row_end_offsets, nnz_indices) + thread_coord_end = merge_path_search(diagonal_end, A.m, length(A.nzval), row_end_offsets, nnz_indices) + + # Consume merge items, whole rows first + running_total = 0.0 + while thread_coord.x < thread_coord_end.x + while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 + running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]] + thread_coord.y += 1 + end + + y[thread_coord.x + 1] = running_total + running_total = 0.0 + thread_coord.x += 1 + end + + # May have thread end up partially consuming a row. + # Save result form partial consumption and do one pass at the end to add it back to y + while thread_coord.y < thread_coord_end.y + running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]] + thread_coord.y += 1 + end + + # Save carry-outs + row_carry_out[tid] = thread_coord_end.x + 1 + value_carry_out[tid] = running_total + + end + + # Diagonistcs + # println("row_carry_out: $row_carry_out") + # println("value_carry_out: $value_carry_out") + # println("Incomplete y: $y") + for tid in 1:num_threads + if row_carry_out[tid] <= A.m + y[row_carry_out[tid]] += value_carry_out[tid] + end + end + +end + + +# y => C[:,k] +# x => B[:,k] +for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) + @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) + A = xA.parent + size(A, 2) == size(C, 1) || throw(DimensionMismatch()) + size(A, 1) == size(B, 1) || throw(DimensionMismatch()) + size(B, 2) == size(C, 2) || throw(DimensionMismatch()) + nzv = nonzeros(A) + rv = rowvals(A) + if β != 1 + β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) + end + for k in 1:size(C, 2) + # parallel implementation goes here acting on slice C[:,k] + @inbounds for col in 1:size(A, 2) + tmp = zero(eltype(C)) + for j in nzrange(A, col) + tmp += $t(nzv[j])*B[rv[j],k] + end + C[col,k] += tmp * α + end + end + C + end +end end \ No newline at end of file diff --git a/src/threaded.jl b/src/threaded.jl deleted file mode 100644 index e69de29..0000000 diff --git a/src/types.jl b/src/types.jl deleted file mode 100644 index e69de29..0000000 diff --git a/test/threaded.jl b/test/threaded.jl deleted file mode 100644 index 08debd5..0000000 --- a/test/threaded.jl +++ /dev/null @@ -1,9 +0,0 @@ -using SparseArrays -using Random - - - - - - - diff --git a/test/types.jl b/test/types.jl deleted file mode 100644 index e69de29..0000000 From 931e670ae0b52279f8f49a9a54d42563280cc64e Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Wed, 28 Dec 2022 12:36:04 -0500 Subject: [PATCH 02/26] override SparseArrays Mul! --- src/ParallelMergeCSR.jl | 81 ++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 22 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index f9853bd..dad0316 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -49,10 +49,14 @@ end # 1. colval => rowval # 2. rowptr => colptr # 3. use iterface of SparseArrays to get these quantities -function merge_csr_mv!(A::SparseMatrixCSR, x, y) - row_end_offsets = A.rowptr[2:end] - nnz_indices = collect(1:length(A.nzval)) - num_merge_items = length(A.nzval) + A.m +function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op) + rv = rowvals(A) + nzval = nonzeros(A) + nrow = size(A,2) # view CSC tranpose as CSR. + + row_end_offsets = rowvals[2:end] + nnz_indices = collect(1:length(nzval)) + num_merge_items = length(nzval) + nrow num_threads = nthreads() items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads @@ -66,26 +70,26 @@ function merge_csr_mv!(A::SparseMatrixCSR, x, y) diagonal_end = min(diagonal + items_per_thread, num_merge_items) # Get starting and ending thread coordinates (row, nnz) - thread_coord = merge_path_search(diagonal, A.m, length(A.nzval), row_end_offsets, nnz_indices) - thread_coord_end = merge_path_search(diagonal_end, A.m, length(A.nzval), row_end_offsets, nnz_indices) + thread_coord = merge_path_search(diagonal, nrow, length(nzval), row_end_offsets, nnz_indices) + thread_coord_end = merge_path_search(diagonal_end, nrow, length(nzval), row_end_offsets, nnz_indices) # Consume merge items, whole rows first - running_total = 0.0 + running_total = zero(eltype(y)) while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]] + running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end - y[thread_coord.x + 1] = running_total - running_total = 0.0 + output[thread_coord.x + 1] = running_total + running_total = zero(eltype(y)) thread_coord.x += 1 end # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]] + running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end @@ -100,8 +104,8 @@ function merge_csr_mv!(A::SparseMatrixCSR, x, y) # println("value_carry_out: $value_carry_out") # println("Incomplete y: $y") for tid in 1:num_threads - if row_carry_out[tid] <= A.m - y[row_carry_out[tid]] += value_carry_out[tid] + if row_carry_out[tid] <= nrow + output[row_carry_out[tid]] += value_carry_out[tid] end end @@ -116,24 +120,57 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) size(A, 2) == size(C, 1) || throw(DimensionMismatch()) size(A, 1) == size(B, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - nzv = nonzeros(A) - rv = rowvals(A) + # nzv = nonzeros(A) + # rv = rowvals(A) if β != 1 β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) end for k in 1:size(C, 2) # parallel implementation goes here acting on slice C[:,k] - @inbounds for col in 1:size(A, 2) - tmp = zero(eltype(C)) - for j in nzrange(A, col) - tmp += $t(nzv[j])*B[rv[j],k] - end - C[col,k] += tmp * α - end + y = @view B[:,k] + x = @view C[:,k] + merge_csr_mv!(A, x, y, $t) end C end end +function atomic_add(a::AbstractVector{T},b::T) where T <: Real + @atomic a[1] += b +end + +function atomic_add(a::AbstractVector{T},b::T) where T <: Complex + view_a = reinterpret(real(T),a) + real_b = real(b) + imag_b = imag(b) + + @atomic view_a[1] += real_b + @atomic view_a[2] += imag_b +end + +function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) + size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + size(A, 1) == size(C, 1) || throw(DimensionMismatch()) + size(B, 2) == size(C, 2) || throw(DimensionMismatch()) + nzv = nonzeros(A) + rv = rowvals(A) + if β != 1 + β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) + end + for k in 1:size(C, 2) + @threads for col in 1:size(A, 2) + @inbounds αxj = B[col,k] * α + for j in nzrange(A, col) + @inbounds val = nzv[j]*αxj + @inbounds row = rv[j] + @inbounds out = C[row:row, k] # pass element as view + atomic_add(out,val) + end + end + end + C +end + + end \ No newline at end of file From e8753b07c89fdd5e30d1a307e85d26225310199b Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Wed, 28 Dec 2022 12:55:48 -0500 Subject: [PATCH 03/26] updating TODOs --- src/ParallelMergeCSR.jl | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index dad0316..60cb6f9 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -43,12 +43,9 @@ end # A matrix # x, y vectors -# TODO: overload SparseArrays.mul! -# TODO: add β and α arguments -# TODO: change SparseMatrixCSR => Transpose{AbstractSparseMatrixCSC} -# 1. colval => rowval -# 2. rowptr => colptr -# 3. use iterface of SparseArrays to get these quantities +# TODO: +# 1. add @inbounds to remove bounds check +# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op) rv = rowvals(A) nzval = nonzeros(A) @@ -125,11 +122,9 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) if β != 1 β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) end - for k in 1:size(C, 2) - # parallel implementation goes here acting on slice C[:,k] - y = @view B[:,k] - x = @view C[:,k] - merge_csr_mv!(A, x, y, $t) + for (k,c) in enumerate(eachcol(C)) + # parallel implementation goes here acting on slice C[:,k], B[:,k] + merge_csr_mv!(A, B[:,k], c, $t) end C end From cd96e3485e587c4d91f1c65f417a80cb03101481 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Wed, 28 Dec 2022 12:56:01 -0500 Subject: [PATCH 04/26] adding tests --- test/runtests.jl | 103 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 103 insertions(+) diff --git a/test/runtests.jl b/test/runtests.jl index e69de29..42a40a2 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -0,0 +1,103 @@ +using Test +using ParallelMergeCSR +using SparseArrays + +# works with AbstractSparseMatrixCSC or just plain AbstractMatrix +m_to_csr(M::AbstractMatrix) = M |> SparseArrays.sparse |> ParallelMergeCSR.SparseMatrixCSR + +@testset "Extreme Cases" begin + + @testset "Singleton" begin + full_m = reshape([1], 1, 1) + csr_m = m_to_csr(full_m) + + x = rand(1) + + y = zeros(csr_m.m) + + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test full_m * x == y + end + + @testset "Single row" begin + + csc_m = SparseArrays.sprand(10, 1, 0.3) + csr_m = m_to_csr(csc_m) + + x = rand(1:10, 1) + + y = zeros(csr_m.m) + + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test csc_m * x ≈ y + end + + @testset "Single column" begin + + csc_m = SparseArrays.sprand(10, 1, 0.3) + csr_m = m_to_csr(csc_m) + + x = rand(1:10, 1) + + y = zeros(csr_m.m) + + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test csc_m * x ≈ y + end +end + +@testset "Square" begin + # 10 x 10 with 30% chance of entry being made + full_m = SparseArrays.sprand(10,10,0.3) + csr_m = m_to_csr(full_m) + + x = rand(1:10, 10, 1) + + y = zeros(csr_m.m) + + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test full_m * x ≈ y + +end + +@testset "4x6" begin + # create matrix + full_m = [10 20 0 0 0 0; + 0 30 0 40 0 0; + 0 0 50 60 70 0; + 0 0 0 0 0 80] + + # get into CSR form + csr_m = m_to_csr(full_m) + + # create vector + x = [5,2,3,1,8,2] + + # create empty solution + y = zeros(Int64, csr_m.m) + + # multiply + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test full_m * x == y +end + +@testset "100x100" begin + # create matrix + csc_m = SparseArrays.sprand(100, 100, 0.3) + csr_m = m_to_csr(csc_m) + + # create vector + x = rand(1:100, 100, 1) + + # create empty solution + y = zeros(csr_m.m) + + ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + + @test csc_m * x ≈ y +end \ No newline at end of file From 8e12ae2d61a4d3a90684a61a12d5a5977fe7ff72 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Wed, 28 Dec 2022 12:58:54 -0500 Subject: [PATCH 05/26] adding += operator --- src/ParallelMergeCSR.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 60cb6f9..7ed32db 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -78,7 +78,7 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output: thread_coord.y += 1 end - output[thread_coord.x + 1] = running_total + output[thread_coord.x + 1] += running_total running_total = zero(eltype(y)) thread_coord.x += 1 end From ededb981621f542103f97cd9706b34eb934e80d6 Mon Sep 17 00:00:00 2001 From: John Long Date: Wed, 28 Dec 2022 17:08:38 -0500 Subject: [PATCH 06/26] Fixed broken .toml --- Project.toml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Project.toml b/Project.toml index c9d8f73..53aaee3 100644 --- a/Project.toml +++ b/Project.toml @@ -1,2 +1,7 @@ +name = "ParallelMergeCSR" +uuid = "c4b929cc-6141-4ba4-8849-280a863346e1" +version = "0.1.0" + [deps] +LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" From 4e979f52066a3087133629edbd998ec404e2543e Mon Sep 17 00:00:00 2001 From: John Long Date: Thu, 29 Dec 2022 13:48:23 -0500 Subject: [PATCH 07/26] Added testing ability to .toml, merge_csr_mv! works with CSC format and Number multiplier --- Project.toml | 7 ++++ src/ParallelMergeCSR.jl | 91 ++++++++++++++++++++++++++++------------- test/runtests.jl | 65 +++++++++++++---------------- 3 files changed, 99 insertions(+), 64 deletions(-) diff --git a/Project.toml b/Project.toml index 53aaee3..5f34ef8 100644 --- a/Project.toml +++ b/Project.toml @@ -5,3 +5,10 @@ version = "0.1.0" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" + + +[extras] +Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" + +[targets] +test = ["Test"] \ No newline at end of file diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 7ed32db..e0caf3a 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -1,7 +1,9 @@ module ParallelMergeCSR -using SparseArrays: AbstractSparseMatrixCSC -using LinearAlgebra:Adjoint,adjoint,Transpose,transpose +using SparseArrays +using SparseArrays: AbstractSparseMatrixCSC, + DenseInputVecOrMat +using LinearAlgebra: Adjoint,adjoint,Transpose,transpose using Base.Threads @@ -43,17 +45,51 @@ end # A matrix # x, y vectors +#= +Algorithm originally designed for CSR now needs to work for CSC + +CSC has fields: +* m - number of rows +* n - number of columns +* colptr::Vector - column j is in colptr[j]:(colptr[j+1]-1) +* rowval::Vector - row indices of stored values +* nzval::Vector - stored values, typically nonzeros + +CSR used to have fields: +* m - number of rows +* n - number of columns +* rowptr::Vector - row i is in rowptr[i]:(rowptr[i+1]-1) +* colval::Vector - col indices of stored values +* nzval::Vector - Stored values, typically non-zeros + +Given a matrix A in CSC, if you do transpose(A), indexing +individual elements is fine but it's a lazy transpose. +To materialize the changes, probably requires you perform some copy operation +e.g. can see changes via dump(copy(transpose(m))) where m isa AbstractMatrixCSC + +* NOTE: we don't want the transpose to actually change the matrix dimensions, +we do it just so the INTERNAL representation of the matrix looks like a CSR +=# + + # TODO: # 1. add @inbounds to remove bounds check -# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix -function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op) - rv = rowvals(A) - nzval = nonzeros(A) - nrow = size(A,2) # view CSC tranpose as CSR. - - row_end_offsets = rowvals[2:end] - nnz_indices = collect(1:length(nzval)) - num_merge_items = length(nzval) + nrow +# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix +# StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place +# Axβ = y +function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) + + # transpose the CSC to CSR + ## colptr in CSC equiv. to rowptr in CSR + ## rowval in CSC equiv. to colval in CSR + ## rows are now columns so the m x n dimensions after transpose are n x m + At = copy(transpose(A)) + row_end_offsets = At.colptr[2:end] + + nz = nonzeros(At) + nrows = size(A, 1) + nnz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats + num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix num_threads = nthreads() items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads @@ -67,26 +103,26 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output: diagonal_end = min(diagonal + items_per_thread, num_merge_items) # Get starting and ending thread coordinates (row, nnz) - thread_coord = merge_path_search(diagonal, nrow, length(nzval), row_end_offsets, nnz_indices) - thread_coord_end = merge_path_search(diagonal_end, nrow, length(nzval), row_end_offsets, nnz_indices) + thread_coord = merge_path_search(diagonal, nrows, length(nz), row_end_offsets, nnz_indices) + thread_coord_end = merge_path_search(diagonal_end, nrows, length(nz), row_end_offsets, nnz_indices) # Consume merge items, whole rows first - running_total = zero(eltype(y)) + running_total = 0.0 while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] thread_coord.y += 1 end - output[thread_coord.x + 1] += running_total - running_total = zero(eltype(y)) + y[thread_coord.x + 1] = running_total * β + running_total = 0.0 thread_coord.x += 1 end # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β thread_coord.y += 1 end @@ -96,13 +132,9 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output: end - # Diagonistcs - # println("row_carry_out: $row_carry_out") - # println("value_carry_out: $value_carry_out") - # println("Incomplete y: $y") for tid in 1:num_threads - if row_carry_out[tid] <= nrow - output[row_carry_out[tid]] += value_carry_out[tid] + if row_carry_out[tid] <= nrows + y[row_carry_out[tid]] += value_carry_out[tid] end end @@ -117,8 +149,7 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) size(A, 2) == size(C, 1) || throw(DimensionMismatch()) size(A, 1) == size(B, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - # nzv = nonzeros(A) - # rv = rowvals(A) + # move multipication by beta into the multithreaded part if β != 1 β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) end @@ -127,10 +158,13 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) merge_csr_mv!(A, B[:,k], c, $t) end C + # end of @eval macro end + # end of for loop end - +#= +# Throws error about @atomic behavior function atomic_add(a::AbstractVector{T},b::T) where T <: Real @atomic a[1] += b end @@ -166,6 +200,7 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De end C end +=# - +# end of the module end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 42a40a2..32ab8cc 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -2,102 +2,95 @@ using Test using ParallelMergeCSR using SparseArrays -# works with AbstractSparseMatrixCSC or just plain AbstractMatrix -m_to_csr(M::AbstractMatrix) = M |> SparseArrays.sparse |> ParallelMergeCSR.SparseMatrixCSR @testset "Extreme Cases" begin @testset "Singleton" begin - full_m = reshape([1], 1, 1) - csr_m = m_to_csr(full_m) + A = sparse(reshape([1], 1, 1)) x = rand(1) - y = zeros(csr_m.m) + y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) - @test full_m * x == y + @test A * x == y end @testset "Single row" begin - csc_m = SparseArrays.sprand(10, 1, 0.3) - csr_m = m_to_csr(csc_m) + A = SparseArrays.sprand(10, 1, 0.3) x = rand(1:10, 1) - y = zeros(csr_m.m) + y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint) - @test csc_m * x ≈ y + @test A * x ≈ y end @testset "Single column" begin - csc_m = SparseArrays.sprand(10, 1, 0.3) - csr_m = m_to_csr(csc_m) + A = SparseArrays.sprand(10, 1, 0.3) x = rand(1:10, 1) - y = zeros(csr_m.m) + y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) - @test csc_m * x ≈ y + @test A * x ≈ y end end @testset "Square" begin # 10 x 10 with 30% chance of entry being made - full_m = SparseArrays.sprand(10,10,0.3) - csr_m = m_to_csr(full_m) + A = SparseArrays.sprand(10,10,0.3) - x = rand(1:10, 10, 1) + x = rand(10) - y = zeros(csr_m.m) + y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint) - @test full_m * x ≈ y + @test (A * x) * 1.1 ≈ y end @testset "4x6" begin # create matrix - full_m = [10 20 0 0 0 0; - 0 30 0 40 0 0; - 0 0 50 60 70 0; - 0 0 0 0 0 80] + m = [10 20 0 0 0 0; + 0 30 0 40 0 0; + 0 0 50 60 70 0; + 0 0 0 0 0 80] # get into CSR form - csr_m = m_to_csr(full_m) + A = sparse(m) # create vector x = [5,2,3,1,8,2] # create empty solution - y = zeros(Int64, csr_m.m) + y = zeros(Int64, size(A, 1)) # multiply - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint) - @test full_m * x == y + @test (A * x) * 2.0 == y end @testset "100x100" begin # create matrix - csc_m = SparseArrays.sprand(100, 100, 0.3) - csr_m = m_to_csr(csc_m) + A = SparseArrays.sprand(100, 100, 0.3) # create vector x = rand(1:100, 100, 1) # create empty solution - y = zeros(csr_m.m) + y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(csr_m, x, y) + ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose) - @test csc_m * x ≈ y + @test (A * x) * 3 ≈ y end \ No newline at end of file From eb4604f8845ddbccced801d81b10afa15f53adfc Mon Sep 17 00:00:00 2001 From: John Long Date: Thu, 29 Dec 2022 14:00:54 -0500 Subject: [PATCH 08/26] Added @inbounds, use nnz to denote number of nonzeros vs length(nz) --- src/ParallelMergeCSR.jl | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index e0caf3a..907a064 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -41,10 +41,6 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) end -# Ax = y -# A matrix -# x, y vectors - #= Algorithm originally designed for CSR now needs to work for CSC @@ -87,8 +83,9 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) row_end_offsets = At.colptr[2:end] nz = nonzeros(At) + nnz = length(nz) nrows = size(A, 1) - nnz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats + nz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix num_threads = nthreads() @@ -102,19 +99,19 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) diagonal = min(items_per_thread * (tid - 1), num_merge_items) diagonal_end = min(diagonal + items_per_thread, num_merge_items) - # Get starting and ending thread coordinates (row, nnz) - thread_coord = merge_path_search(diagonal, nrows, length(nz), row_end_offsets, nnz_indices) - thread_coord_end = merge_path_search(diagonal_end, nrows, length(nz), row_end_offsets, nnz_indices) + # Get starting and ending thread coordinates (row, nz) + thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices) + thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) # Consume merge items, whole rows first running_total = 0.0 while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] + @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] thread_coord.y += 1 end - y[thread_coord.x + 1] = running_total * β + @inbounds y[thread_coord.x + 1] = running_total * β running_total = 0.0 thread_coord.x += 1 end @@ -122,7 +119,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β + @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β thread_coord.y += 1 end @@ -133,8 +130,8 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) end for tid in 1:num_threads - if row_carry_out[tid] <= nrows - y[row_carry_out[tid]] += value_carry_out[tid] + @inbounds if row_carry_out[tid] <= nrows + @inbounds y[row_carry_out[tid]] += value_carry_out[tid] end end From b62adb7da2d78671ab30d7b3db1fe8ca2f7f2ec7 Mon Sep 17 00:00:00 2001 From: John Long Date: Thu, 29 Dec 2022 16:48:28 -0500 Subject: [PATCH 09/26] Attempted fix for atomic ops, renamed args for merge_csr_mv, add tests --- Project.toml | 4 +- src/ParallelMergeCSR.jl | 61 +++++++++++++++---------- test/atomic_add!.jl | 27 +++++++++++ test/merge_csr_mv!.jl | 99 +++++++++++++++++++++++++++++++++++++++++ test/mul!.jl | 27 +++++++++++ test/runtests.jl | 95 +++------------------------------------ 6 files changed, 199 insertions(+), 114 deletions(-) create mode 100644 test/atomic_add!.jl create mode 100644 test/merge_csr_mv!.jl create mode 100644 test/mul!.jl diff --git a/Project.toml b/Project.toml index 5f34ef8..f1a521d 100644 --- a/Project.toml +++ b/Project.toml @@ -3,12 +3,12 @@ uuid = "c4b929cc-6141-4ba4-8849-280a863346e1" version = "0.1.0" [deps] +Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" - [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" [targets] -test = ["Test"] \ No newline at end of file +test = ["Test"] diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 907a064..b0692d9 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -7,6 +7,8 @@ using LinearAlgebra: Adjoint,adjoint,Transpose,transpose using Base.Threads +using Atomix + mutable struct Coordinate x::Int y::Int @@ -73,7 +75,7 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place # Axβ = y -function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) +function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op) # transpose the CSC to CSR ## colptr in CSC equiv. to rowptr in CSR @@ -107,11 +109,11 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) running_total = 0.0 while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] + @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] thread_coord.y += 1 end - @inbounds y[thread_coord.x + 1] = running_total * β + @inbounds output[thread_coord.x + 1] = running_total * β running_total = 0.0 thread_coord.x += 1 end @@ -119,7 +121,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β + @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] * β thread_coord.y += 1 end @@ -131,48 +133,62 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op) for tid in 1:num_threads @inbounds if row_carry_out[tid] <= nrows - @inbounds y[row_carry_out[tid]] += value_carry_out[tid] + @inbounds output[row_carry_out[tid]] += value_carry_out[tid] end end end -# y => C[:,k] -# x => B[:,k] +# C = adjoint(A)Bα + Cβ +# C = transpose(A)B + Cβ +# C = xABα + Cβ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) + # obtains the original matrix underneath the "lazy wrapper" A = xA.parent size(A, 2) == size(C, 1) || throw(DimensionMismatch()) size(A, 1) == size(B, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - # move multipication by beta into the multithreaded part - if β != 1 - β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) - end - for (k,c) in enumerate(eachcol(C)) - # parallel implementation goes here acting on slice C[:,k], B[:,k] - merge_csr_mv!(A, B[:,k], c, $t) + # move multiplication by beta into the multithreaded part + ABα = similar(C) + for (row_idx, col) in enumerate(eachcol(B)) + # merge_csr_mv!(A, x, β, y, op) + merge_csr_mv!(A, col, α, ABα[row_idx, :], $t) end + C = ABα + C*β C # end of @eval macro end # end of for loop end -#= + # Throws error about @atomic behavior -function atomic_add(a::AbstractVector{T},b::T) where T <: Real - @atomic a[1] += b +## Probably meant to add b to every element of a while mutating a? +## function names should be suffixed w/ exclamation mark +function atomic_add!(a::AbstractVector{T},b::T) where T <: Real + # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes + # you'd have to wrap each + # Objective: for each element in AbstractVector, atomically add b to the element + # Can also use Atomix.jl + for idx in 1:length(a) + Atomix.@atomic a[idx] += b + end + end -function atomic_add(a::AbstractVector{T},b::T) where T <: Complex - view_a = reinterpret(real(T),a) +function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex + # return type representing real part + view_a = reinterpret(reshape, real(T),a) real_b = real(b) imag_b = imag(b) - @atomic view_a[1] += real_b - @atomic view_a[2] += imag_b + # mutating the view also mutates the underlying array it came from + for idx in 1:size(view_a, 2) + Atomix.@atomic view_a[1, idx] += real_b + Atomix.@atomic view_a[2, idx] += imag_b + end end function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) @@ -191,13 +207,12 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De @inbounds val = nzv[j]*αxj @inbounds row = rv[j] @inbounds out = C[row:row, k] # pass element as view - atomic_add(out,val) + atomic_add!(out,val) end end end C end -=# # end of the module end \ No newline at end of file diff --git a/test/atomic_add!.jl b/test/atomic_add!.jl new file mode 100644 index 0000000..5814afa --- /dev/null +++ b/test/atomic_add!.jl @@ -0,0 +1,27 @@ +using Test +using ParallelMergeCSR +using SparseArrays + +@testset "atomic_add!" begin + # atomic add with real + @testset "Real" begin + a = rand(Float64, 20) + a_copy = deepcopy(a) + + b = 1.25 + + ParallelMergeCSR.atomic_add!(a, b) + @test a == (a_copy .+ b) + end + + # atomic add with complex + @testset "Complex" begin + a = rand(Complex{Float64}, 20) + a_copy = deepcopy(a) + + b = 2.5 + 0.8im + + ParallelMergeCSR.atomic_add!(a, b) + @test a == (a_copy .+ b) + end +end \ No newline at end of file diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl new file mode 100644 index 0000000..fecf2e8 --- /dev/null +++ b/test/merge_csr_mv!.jl @@ -0,0 +1,99 @@ +using Test +using ParallelMergeCSR +using SparseArrays + +## NOTE: Sparse matrices are converted to dense form in the @test's +## considering that our redefinition of SparseArrays.mul! seems to +## interfere + +@testset "Extreme Cases" begin + + @testset "Singleton" begin + A = sparse(reshape([1], 1, 1)) + + x = rand(1) + + y = zeros(size(A, 1)) + + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) + + @test Matrix(A) * x == y + end + + @testset "Single row" begin + + A = SparseArrays.sprand(10, 1, 0.3) + + x = rand(1:10, 1) + + y = zeros(size(A, 1)) + + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint) + + @test Matrix(A) * x ≈ y + end + + @testset "Single column" begin + + A = SparseArrays.sprand(10, 1, 0.3) + + x = rand(1:10, 1) + + y = zeros(size(A, 1)) + + ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) + + @test Matrix(A) * x ≈ y + end +end + +@testset "Square" begin + # 10 x 10 with 30% chance of entry being made + A = SparseArrays.sprand(10,10,0.3) + + x = rand(10) + + y = zeros(size(A, 1)) + + ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint) + + @test (Matrix(A) * x) * 1.1 ≈ y + +end + +@testset "4x6" begin + # create matrix + m = [10 20 0 0 0 0; + 0 30 0 40 0 0; + 0 0 50 60 70 0; + 0 0 0 0 0 80] + + # get into CSR form + A = sparse(m) + + # create vector + x = [5,2,3,1,8,2] + + # create empty solution + y = zeros(Int64, size(A, 1)) + + # multiply + ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint) + + @test m * x * 2.0 == y +end + +@testset "100x100" begin + # create matrix + A = SparseArrays.sprand(100, 100, 0.3) + + # create vector + x = rand(1:100, 100, 1) + + # create empty solution + y = zeros(size(A, 1)) + + ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose) + + @test Matrix(A) * x * 3 ≈ y +end \ No newline at end of file diff --git a/test/mul!.jl b/test/mul!.jl new file mode 100644 index 0000000..5814afa --- /dev/null +++ b/test/mul!.jl @@ -0,0 +1,27 @@ +using Test +using ParallelMergeCSR +using SparseArrays + +@testset "atomic_add!" begin + # atomic add with real + @testset "Real" begin + a = rand(Float64, 20) + a_copy = deepcopy(a) + + b = 1.25 + + ParallelMergeCSR.atomic_add!(a, b) + @test a == (a_copy .+ b) + end + + # atomic add with complex + @testset "Complex" begin + a = rand(Complex{Float64}, 20) + a_copy = deepcopy(a) + + b = 2.5 + 0.8im + + ParallelMergeCSR.atomic_add!(a, b) + @test a == (a_copy .+ b) + end +end \ No newline at end of file diff --git a/test/runtests.jl b/test/runtests.jl index 32ab8cc..ecfe12b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,96 +1,13 @@ using Test -using ParallelMergeCSR -using SparseArrays - - -@testset "Extreme Cases" begin - - @testset "Singleton" begin - A = sparse(reshape([1], 1, 1)) - - x = rand(1) - - y = zeros(size(A, 1)) - - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) - - @test A * x == y - end - - @testset "Single row" begin - - A = SparseArrays.sprand(10, 1, 0.3) - - x = rand(1:10, 1) - - y = zeros(size(A, 1)) - - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint) - - @test A * x ≈ y - end - - @testset "Single column" begin - - A = SparseArrays.sprand(10, 1, 0.3) - - x = rand(1:10, 1) - - y = zeros(size(A, 1)) - - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) - - @test A * x ≈ y - end -end - -@testset "Square" begin - # 10 x 10 with 30% chance of entry being made - A = SparseArrays.sprand(10,10,0.3) - - x = rand(10) - - y = zeros(size(A, 1)) - - ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint) - - @test (A * x) * 1.1 ≈ y +@testset "merge_csr_mv!" begin + include("merge_csr_mv!.jl") end -@testset "4x6" begin - # create matrix - m = [10 20 0 0 0 0; - 0 30 0 40 0 0; - 0 0 50 60 70 0; - 0 0 0 0 0 80] - - # get into CSR form - A = sparse(m) - - # create vector - x = [5,2,3,1,8,2] - - # create empty solution - y = zeros(Int64, size(A, 1)) - - # multiply - ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint) - - @test (A * x) * 2.0 == y +@testset "atomic_add!" begin + include("atomic_add!.jl") end -@testset "100x100" begin - # create matrix - A = SparseArrays.sprand(100, 100, 0.3) - - # create vector - x = rand(1:100, 100, 1) - - # create empty solution - y = zeros(size(A, 1)) - - ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose) - - @test (A * x) * 3 ≈ y +@testset "mul!" begin + include("mul!.jl") end \ No newline at end of file From f7f46b206d3897b3a182b3305c2cea879d19a120 Mon Sep 17 00:00:00 2001 From: John Long Date: Thu, 29 Dec 2022 17:57:15 -0500 Subject: [PATCH 10/26] Fixed broken default matrix-matrix, matrix-vector mul, added simple tests --- src/ParallelMergeCSR.jl | 26 +++++++++++++------ test/mul!.jl | 57 +++++++++++++++++++++++++++++------------ 2 files changed, 58 insertions(+), 25 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index b0692d9..6360272 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -150,8 +150,14 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) size(A, 2) == size(C, 1) || throw(DimensionMismatch()) size(A, 1) == size(B, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - # move multiplication by beta into the multithreaded part - ABα = similar(C) + if β != 1 + # assume the rmul! intended to come from SparseArrays + ## (could implement a threaded version of that as well, maybe piggyback off of impl branch) + # preemptively handle β so we just handle C = ABα + C + β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) + end + # move multiplication by alpha into the multithreaded part + ABα = zeros(C) for (row_idx, col) in enumerate(eachcol(B)) # merge_csr_mv!(A, x, β, y, op) merge_csr_mv!(A, col, α, ABα[row_idx, :], $t) @@ -164,12 +170,10 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) end -# Throws error about @atomic behavior -## Probably meant to add b to every element of a while mutating a? ## function names should be suffixed w/ exclamation mark function atomic_add!(a::AbstractVector{T},b::T) where T <: Real # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes - # you'd have to wrap each + # you'd have to wrap each element of the array with Atomic # Objective: for each element in AbstractVector, atomically add b to the element # Can also use Atomix.jl for idx in 1:length(a) @@ -198,15 +202,21 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De nzv = nonzeros(A) rv = rowvals(A) if β != 1 - β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C))) + # assume the rmul! intended to come from SparseArrays + ## (could implement a threaded version of that as well, maybe piggyback off of impl branch) + # preemptively handle β so we just handle C = ABα + C + β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) end + # iterate over the columns of C for k in 1:size(C, 2) + # iterate over columns of A @threads for col in 1:size(A, 2) + # multiply each element of B times alpha (single value) @inbounds αxj = B[col,k] * α - for j in nzrange(A, col) + for j in nzrange(A, col) # range of indices in nzv of A restricted to column @inbounds val = nzv[j]*αxj @inbounds row = rv[j] - @inbounds out = C[row:row, k] # pass element as view + @inbounds out = @view C[row:row, k] atomic_add!(out,val) end end diff --git a/test/mul!.jl b/test/mul!.jl index 5814afa..e7b27d7 100644 --- a/test/mul!.jl +++ b/test/mul!.jl @@ -2,26 +2,49 @@ using Test using ParallelMergeCSR using SparseArrays -@testset "atomic_add!" begin - # atomic add with real - @testset "Real" begin - a = rand(Float64, 20) - a_copy = deepcopy(a) +# trigger merge_csr_mv! +@testset "Adjoint" begin + + # C = ABα + Cβ +end - b = 1.25 +# trigger merge_csr_mv! +@testset "Transpose" begin - ParallelMergeCSR.atomic_add!(a, b) - @test a == (a_copy .+ b) - end + # C = ABα + Cβ +end - # atomic add with complex - @testset "Complex" begin - a = rand(Complex{Float64}, 20) - a_copy = deepcopy(a) +@testset "Matrix x Matrix" begin - b = 2.5 + 0.8im + # C = ABα + Cβ + ## A should be CSC matrix, B should be Dense + ## α, β are `Number`s + A = sprand(5, 5, 0.2) + B = rand(5, 5) + α = 1.1 + + C = zeros(Float64, 5, 5) + C_copy = deepcopy(C) + β = 0.3 + + SparseArrays.mul!(C, A, B, α, β) + @test C ≈ Matrix(A) * B * α + C_copy * β + +end + +@testset "Matrix x Vector" begin + + A = sprand(5, 5, 0.3) + B = rand(5) + α = 2.5 + + C = zeros(Float64, size(B)) + C_copy = deepcopy(C) + β = 5.2 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β + - ParallelMergeCSR.atomic_add!(a, b) - @test a == (a_copy .+ b) - end end \ No newline at end of file From 45478d1360840d66d72260c4836a149fb53404ab Mon Sep 17 00:00:00 2001 From: John Long Date: Fri, 30 Dec 2022 11:54:32 -0500 Subject: [PATCH 11/26] Added test for matrix x matrix operation using merge_csr_mv! --- test/merge_csr_mv!.jl | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index fecf2e8..241a258 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -96,4 +96,28 @@ end ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose) @test Matrix(A) * x * 3 ≈ y +end + +@testset "Matrix x Matrix" begin + + ## Calculate AXα + # Create Matrices + A = SparseArrays.sparse(rand(1:10, 3, 2)) + X = rand(1:10, 2, 4) + + # set alpha to 1.0 for now + α = 1.0 + + # Create place to store solution + Y = zeros(3, 4) + + # iterate + for (idx, col) in enumerate(eachcol(X)) + # merge_csr_mv!(A, x, β, y, op) + Y_view = @view Y[:, idx] + ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x) + end + + @test Matrix(A) * X == Y + end \ No newline at end of file From 381437f2bc0a3b592895251f3e77e543540477fc Mon Sep 17 00:00:00 2001 From: John Long Date: Fri, 30 Dec 2022 11:55:29 -0500 Subject: [PATCH 12/26] Fixed issue with zeros, partial fix for AB\alpha operation inside mul! not mutating --- src/ParallelMergeCSR.jl | 10 ++++++---- test/mul!.jl | 28 +++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 6360272..59d82c8 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -157,12 +157,14 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) end # move multiplication by alpha into the multithreaded part - ABα = zeros(C) - for (row_idx, col) in enumerate(eachcol(B)) + ABα = zeros(size(C)) + for (col_idx, col) in enumerate(eachcol(B)) # merge_csr_mv!(A, x, β, y, op) - merge_csr_mv!(A, col, α, ABα[row_idx, :], $t) + ABα_view = @view ABα[:, col_idx] + merge_csr_mv!(A, col, α, ABα_view, $t) end - C = ABα + C*β + # doesn't seem to mutate properly + C += ABα C # end of @eval macro end diff --git a/test/mul!.jl b/test/mul!.jl index e7b27d7..4520170 100644 --- a/test/mul!.jl +++ b/test/mul!.jl @@ -5,13 +5,35 @@ using SparseArrays # trigger merge_csr_mv! @testset "Adjoint" begin - # C = ABα + Cβ + # C = adjoint(A)Bα + Cβ + A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3)) + B = rand(5,5) + α = 2.3 + + C = rand(Complex{Float64}, 5, 5) + C_copy = deepcopy(C) + β = 1.2 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β end # trigger merge_csr_mv! @testset "Transpose" begin - # C = ABα + Cβ + # C = transpose(A)Bα + Cβ + A = transpose(sprand(4, 4, 0.3)) + B = rand(1:10, (4,4)) + α = 1.0 + + C = zeros(Float64, 4, 4) + C_copy = deepcopy(C) + β = 1.0 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β end @testset "Matrix x Matrix" begin @@ -23,7 +45,7 @@ end B = rand(5, 5) α = 1.1 - C = zeros(Float64, 5, 5) + C = zeros(5, 5) C_copy = deepcopy(C) β = 0.3 From 84393bcb4b7b39d9bb60af9b775c4c855b8b0b13 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Tue, 3 Jan 2023 10:41:02 -0500 Subject: [PATCH 13/26] fixing issue with mul! --- src/ParallelMergeCSR.jl | 55 +++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 24 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 59d82c8..11645e8 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -2,7 +2,11 @@ module ParallelMergeCSR using SparseArrays using SparseArrays: AbstractSparseMatrixCSC, - DenseInputVecOrMat + DenseInputVecOrMat, + getcolptr, + getrowval, + getnzval + using LinearAlgebra: Adjoint,adjoint,Transpose,transpose using Base.Threads @@ -75,53 +79,59 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place # Axβ = y -function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op) +function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op) # transpose the CSC to CSR ## colptr in CSC equiv. to rowptr in CSR ## rowval in CSC equiv. to colval in CSR ## rows are now columns so the m x n dimensions after transpose are n x m - At = copy(transpose(A)) - row_end_offsets = At.colptr[2:end] - nz = nonzeros(At) - nnz = length(nz) - nrows = size(A, 1) - nz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats - num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix + # At = copy(transpose(A)) + # row_end_offsets = At.colptr[2:end] + + nzv = getnzval(A) + rv = getrowval(A) + cp = getcolptr(A) + + nnz = length(nzv) + nrows = length(cp) - 1 + + nz_indices = rv + row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats + num_merge_items = length(nzv) + nrows # preserve the dimensions of the original matrix num_threads = nthreads() items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads - row_carry_out = zeros(Int, num_threads) - value_carry_out = zeros(Float64, num_threads) + row_carry_out = zeros(eltype(cp), num_threads) + value_carry_out = zeros(eltype(output), num_threads) # value must match output # Julia threads start id by 1, so make sure to offset! @threads for tid in 1:num_threads diagonal = min(items_per_thread * (tid - 1), num_merge_items) diagonal_end = min(diagonal + items_per_thread, num_merge_items) - # Get starting and ending thread coordinates (row, nz) + # Get starting and ending thread coordinates (row, nzv) thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices) thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) # Consume merge items, whole rows first - running_total = 0.0 + running_total = 0 while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end - @inbounds output[thread_coord.x + 1] = running_total * β - running_total = 0.0 + @inbounds output[thread_coord.x + 1] += α * running_total + running_total = 0 thread_coord.x += 1 end # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] * β + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end @@ -133,7 +143,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op) for tid in 1:num_threads @inbounds if row_carry_out[tid] <= nrows - @inbounds output[row_carry_out[tid]] += value_carry_out[tid] + @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid] end end @@ -157,14 +167,11 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) end # move multiplication by alpha into the multithreaded part - ABα = zeros(size(C)) - for (col_idx, col) in enumerate(eachcol(B)) + for (col_idx, input) in enumerate(eachcol(B)) # merge_csr_mv!(A, x, β, y, op) - ABα_view = @view ABα[:, col_idx] - merge_csr_mv!(A, col, α, ABα_view, $t) + output = @view C[:, col_idx] + merge_csr_mv!(α, A, input, output, $t) end - # doesn't seem to mutate properly - C += ABα C # end of @eval macro end From 1a89711c16be2c93118d0dcb4a2d249abdd7e261 Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 10:47:51 -0500 Subject: [PATCH 14/26] Try to use @view to fix issue with matrix not being mutated --- src/ParallelMergeCSR.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 59d82c8..5c3dc0e 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -164,7 +164,7 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) merge_csr_mv!(A, col, α, ABα_view, $t) end # doesn't seem to mutate properly - C += ABα + C[:,:] += ABα C # end of @eval macro end From b6c26ccbefa85cc52f12c78028e2a6aa9a6f7e90 Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 10:50:42 -0500 Subject: [PATCH 15/26] Add tests for transpose on square and rectangular matrices --- test/mul!.jl | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/test/mul!.jl b/test/mul!.jl index 4520170..16e3683 100644 --- a/test/mul!.jl +++ b/test/mul!.jl @@ -20,14 +20,31 @@ using SparseArrays end # trigger merge_csr_mv! -@testset "Transpose" begin +@testset "Transpose Square" begin # C = transpose(A)Bα + Cβ - A = transpose(sprand(4, 4, 0.3)) - B = rand(1:10, (4,4)) + A = transpose(sparse(rand(1:5, 4, 4))) + B = rand(1:5, (4,4)) α = 1.0 - C = zeros(Float64, 4, 4) + C = zeros(4, 4) + C_copy = deepcopy(C) + β = 1.0 + + SparseArrays.mul!(C, A, B, α, β) + + # right hand side is correct, left hand side is problematic + @test C ≈ Matrix(A) * B * α + C_copy * β +end + +@testset "Transpose Rectangular" begin + + A = sparse([1 2 3 4; 5 6 7 8]) |> transpose + B = [1 2 3; 4 5 6] + + α = 1.0 + + C = zeros(4, 3) C_copy = deepcopy(C) β = 1.0 From 182aaa53967e114480e2fbdab0a2a452eefae32c Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 11:41:11 -0500 Subject: [PATCH 16/26] Changed unit tests to match new argument ordering --- test/merge_csr_mv!.jl | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 241a258..b01c67f 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -6,6 +6,9 @@ using SparseArrays ## considering that our redefinition of SparseArrays.mul! seems to ## interfere +## New function signature for merge_csr_mv!, it is now +## merge_csr_mv!(α, A, input, output, op) +## so should be A*input*α = output with some op @testset "Extreme Cases" begin @testset "Singleton" begin @@ -15,7 +18,7 @@ using SparseArrays y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) + ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose) @test Matrix(A) * x == y end @@ -28,7 +31,7 @@ using SparseArrays y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint) + ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, adjoint) @test Matrix(A) * x ≈ y end @@ -41,7 +44,7 @@ using SparseArrays y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose) + ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose) @test Matrix(A) * x ≈ y end @@ -55,7 +58,7 @@ end y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint) + ParallelMergeCSR.merge_csr_mv!(1.1, A, x, y, adjoint) @test (Matrix(A) * x) * 1.1 ≈ y @@ -78,7 +81,8 @@ end y = zeros(Int64, size(A, 1)) # multiply - ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint) + ParallelMergeCSR.merge_csr_mv!(2.0, A, x, y, adjoint) + @test m * x * 2.0 == y end @@ -93,7 +97,7 @@ end # create empty solution y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose) + ParallelMergeCSR.merge_csr_mv!(3, A, x, y, transpose) @test Matrix(A) * x * 3 ≈ y end @@ -115,7 +119,8 @@ end for (idx, col) in enumerate(eachcol(X)) # merge_csr_mv!(A, x, β, y, op) Y_view = @view Y[:, idx] - ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x) + ParallelMergeCSR.merge_csr_mv!(α, A, col, Y_view, x -> x) + # ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x) end @test Matrix(A) * X == Y From 35af5feef9797f6c8ff51faa9f70371a65dbbcda Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Tue, 3 Jan 2023 12:06:50 -0500 Subject: [PATCH 17/26] adding StridedVector type --- src/ParallelMergeCSR.jl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 11645e8..17900c6 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -79,7 +79,7 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place # Axβ = y -function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op) +function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) # transpose the CSC to CSR ## colptr in CSC equiv. to rowptr in CSR @@ -116,7 +116,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op) thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) # Consume merge items, whole rows first - running_total = 0 + running_total = zero(eltype(output)) while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] @@ -124,7 +124,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op) end @inbounds output[thread_coord.x + 1] += α * running_total - running_total = 0 + running_total = zero(eltype(output)) thread_coord.x += 1 end From 8cd636bc4956ddb883386e78a974a93aec1ac07f Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 14:40:45 -0500 Subject: [PATCH 18/26] fixed segfault, revised some tests to adjoint/transpose matrix v. plain matrix --- src/ParallelMergeCSR.jl | 4 +++- test/merge_csr_mv!.jl | 39 ++++++++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 16 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 11645e8..0026f4c 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -94,7 +94,9 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op) cp = getcolptr(A) nnz = length(nzv) - nrows = length(cp) - 1 + + # nrows = length(cp) - 1 can give the wrong number of rows! + nrows = A.m nz_indices = rv row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index b01c67f..7ae8fc6 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -18,49 +18,58 @@ using SparseArrays y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose) + ParallelMergeCSR.merge_csr_mv!(0.3, A, x, y, transpose) - @test Matrix(A) * x == y + @test Matrix(A) * x * 0.3 == y end @testset "Single row" begin + # 10 x 1 converted to 1 x 10 A = SparseArrays.sprand(10, 1, 0.3) + A_transpose = copy(transpose(A)) - x = rand(1:10, 1) + # x needs to be 10 x 1 + x = rand(size(A, 1)) - y = zeros(size(A, 1)) + y = zeros(eltype(x), 1) - ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, adjoint) + ParallelMergeCSR.merge_csr_mv!(1.1, A_transpose, x, y, transpose) - @test Matrix(A) * x ≈ y + @test (transpose(Matrix(A)) * x) * 1.1 ≈ y end @testset "Single column" begin - A = SparseArrays.sprand(10, 1, 0.3) + # 1 x 10 is now 10 x 1 + A = SparseArrays.sprand(1, 10, 0.3) + A_adjoint = copy(adjoint(A)) - x = rand(1:10, 1) + # needs to be 1 x 1 + x = rand() - y = zeros(size(A, 1)) + # 10 x 1 + y = zeros(size(A_adjoint)) - ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose) + ParallelMergeCSR.merge_csr_mv!(0.7, A_adjoint, x, y, adjoint) - @test Matrix(A) * x ≈ y + @test adjoint(Matrix(A)) * x * 0.7 ≈ y end end @testset "Square" begin # 10 x 10 with 30% chance of entry being made A = SparseArrays.sprand(10,10,0.3) + A_adjoint = copy(adjoint(A)) + # 10 x 1 x = rand(10) - y = zeros(size(A, 1)) + y = zeros(size(x)) - ParallelMergeCSR.merge_csr_mv!(1.1, A, x, y, adjoint) + ParallelMergeCSR.merge_csr_mv!(1.1, A_adjoint, x, y, adjoint) - @test (Matrix(A) * x) * 1.1 ≈ y + @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y end @@ -71,7 +80,7 @@ end 0 0 50 60 70 0; 0 0 0 0 0 80] - # get into CSR form + # get into CSC form A = sparse(m) # create vector From 34eda6b3f482be1c577a87b6b0414235146480ad Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Tue, 3 Jan 2023 14:49:20 -0500 Subject: [PATCH 19/26] fixing imports --- src/ParallelMergeCSR.jl | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index a8871d8..f1a17ba 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -3,9 +3,7 @@ module ParallelMergeCSR using SparseArrays using SparseArrays: AbstractSparseMatrixCSC, DenseInputVecOrMat, - getcolptr, - getrowval, - getnzval + getcolptr using LinearAlgebra: Adjoint,adjoint,Transpose,transpose @@ -89,8 +87,8 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # At = copy(transpose(A)) # row_end_offsets = At.colptr[2:end] - nzv = getnzval(A) - rv = getrowval(A) + nzv = nonzeros(A) + rv = rowvals(A) cp = getcolptr(A) nnz = length(nzv) From 366d2e8e56dbf6ab8d048da1359c2f860036abd8 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Tue, 3 Jan 2023 15:44:41 -0500 Subject: [PATCH 20/26] rm inbounds no longer overloading SparseArray.mul! Co-authored-by: John Long --- src/ParallelMergeCSR.jl | 12 +++++++----- test/merge_csr_mv!.jl | 38 +++++++++++++++++++------------------- 2 files changed, 26 insertions(+), 24 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index f1a17ba..9b7908b 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -94,7 +94,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect nnz = length(nzv) # nrows = length(cp) - 1 can give the wrong number of rows! - nrows = A.m + nrows = A.n nz_indices = rv row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats @@ -119,11 +119,13 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect running_total = zero(eltype(output)) while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + # @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end - @inbounds output[thread_coord.x + 1] += α * running_total + # @inbounds output[thread_coord.x + 1] += α * running_total + output[thread_coord.x + 1] += α * running_total running_total = zero(eltype(output)) thread_coord.x += 1 end @@ -154,7 +156,7 @@ end # C = transpose(A)B + Cβ # C = xABα + Cβ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) - @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) + @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) # obtains the original matrix underneath the "lazy wrapper" A = xA.parent size(A, 2) == size(C, 1) || throw(DimensionMismatch()) @@ -204,7 +206,7 @@ function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex end end -function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) +function mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) size(A, 2) == size(B, 1) || throw(DimensionMismatch()) size(A, 1) == size(C, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 7ae8fc6..42117da 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -1,5 +1,5 @@ using Test -using ParallelMergeCSR +using ParallelMergeCSR: merge_csr_mv! using SparseArrays ## NOTE: Sparse matrices are converted to dense form in the @test's @@ -18,7 +18,7 @@ using SparseArrays y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(0.3, A, x, y, transpose) + merge_csr_mv!(0.3, A, x, y, transpose) @test Matrix(A) * x * 0.3 == y end @@ -27,14 +27,13 @@ using SparseArrays # 10 x 1 converted to 1 x 10 A = SparseArrays.sprand(10, 1, 0.3) - A_transpose = copy(transpose(A)) # x needs to be 10 x 1 x = rand(size(A, 1)) y = zeros(eltype(x), 1) - ParallelMergeCSR.merge_csr_mv!(1.1, A_transpose, x, y, transpose) + merge_csr_mv!(1.1, A, x, y, transpose) @test (transpose(Matrix(A)) * x) * 1.1 ≈ y end @@ -43,31 +42,31 @@ using SparseArrays # 1 x 10 is now 10 x 1 A = SparseArrays.sprand(1, 10, 0.3) - A_adjoint = copy(adjoint(A)) - + # needs to be 1 x 1 - x = rand() + x = rand(1) # 10 x 1 - y = zeros(size(A_adjoint)) + y = zeros(size(A,2)) + + merge_csr_mv!(0.7, A, x, y, adjoint) - ParallelMergeCSR.merge_csr_mv!(0.7, A_adjoint, x, y, adjoint) + y_exact = 0.7 * adjoint(Matrix(A)) * x - @test adjoint(Matrix(A)) * x * 0.7 ≈ y + @test y_exact ≈ y end end @testset "Square" begin # 10 x 10 with 30% chance of entry being made A = SparseArrays.sprand(10,10,0.3) - A_adjoint = copy(adjoint(A)) # 10 x 1 x = rand(10) y = zeros(size(x)) - ParallelMergeCSR.merge_csr_mv!(1.1, A_adjoint, x, y, adjoint) + merge_csr_mv!(1.1, A, x, y, adjoint) @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y @@ -84,16 +83,17 @@ end A = sparse(m) # create vector - x = [5,2,3,1,8,2] + # x = [5,2,3,1,8,2] + x = [5,2,3,1] # create empty solution - y = zeros(Int64, size(A, 1)) + y = zeros(Int64, size(A, 2)) # multiply - ParallelMergeCSR.merge_csr_mv!(2.0, A, x, y, adjoint) + merge_csr_mv!(2.0, A, x, y, adjoint) - @test m * x * 2.0 == y + @test transpose(m) * x * 2.0 == y end @testset "100x100" begin @@ -106,7 +106,7 @@ end # create empty solution y = zeros(size(A, 1)) - ParallelMergeCSR.merge_csr_mv!(3, A, x, y, transpose) + merge_csr_mv!(3, A, x, y, transpose) @test Matrix(A) * x * 3 ≈ y end @@ -128,8 +128,8 @@ end for (idx, col) in enumerate(eachcol(X)) # merge_csr_mv!(A, x, β, y, op) Y_view = @view Y[:, idx] - ParallelMergeCSR.merge_csr_mv!(α, A, col, Y_view, x -> x) - # ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x) + merge_csr_mv!(α, A, col, Y_view, x -> x) + # merge_csr_mv!(A, col, α, Y_view, x -> x) end @test Matrix(A) * X == Y From 778d524cf8c6b7d90f68425a4f60331a2910d18b Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 17:48:35 -0500 Subject: [PATCH 21/26] Fixed issue with incorrect output and updated tests to always use transposed/adjointed matrices --- src/ParallelMergeCSR.jl | 11 +++-------- test/merge_csr_mv!.jl | 34 +++++++++++++++++----------------- 2 files changed, 20 insertions(+), 25 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 9b7908b..416bd91 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -84,9 +84,6 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect ## rowval in CSC equiv. to colval in CSR ## rows are now columns so the m x n dimensions after transpose are n x m - # At = copy(transpose(A)) - # row_end_offsets = At.colptr[2:end] - nzv = nonzeros(A) rv = rowvals(A) cp = getcolptr(A) @@ -96,9 +93,9 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # nrows = length(cp) - 1 can give the wrong number of rows! nrows = A.n - nz_indices = rv + nz_indices = collect(1:nnz) row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats - num_merge_items = length(nzv) + nrows # preserve the dimensions of the original matrix + num_merge_items = nnz + nrows # preserve the dimensions of the original matrix num_threads = nthreads() items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads @@ -119,12 +116,10 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect running_total = zero(eltype(output)) while thread_coord.x < thread_coord_end.x while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - # @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end - # @inbounds output[thread_coord.x + 1] += α * running_total output[thread_coord.x + 1] += α * running_total running_total = zero(eltype(output)) thread_coord.x += 1 @@ -133,7 +128,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 42117da..5fc5ec9 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -5,10 +5,6 @@ using SparseArrays ## NOTE: Sparse matrices are converted to dense form in the @test's ## considering that our redefinition of SparseArrays.mul! seems to ## interfere - -## New function signature for merge_csr_mv!, it is now -## merge_csr_mv!(α, A, input, output, op) -## so should be A*input*α = output with some op @testset "Extreme Cases" begin @testset "Singleton" begin @@ -57,6 +53,7 @@ using SparseArrays end end +# test fails, y seems to have lots of zero-entries @testset "Square" begin # 10 x 10 with 30% chance of entry being made A = SparseArrays.sprand(10,10,0.3) @@ -64,6 +61,7 @@ end # 10 x 1 x = rand(10) + # 10 x 1 y = zeros(size(x)) merge_csr_mv!(1.1, A, x, y, adjoint) @@ -93,7 +91,7 @@ end merge_csr_mv!(2.0, A, x, y, adjoint) - @test transpose(m) * x * 2.0 == y + @test Matrix(adjoint(m)) * x * 2.0 == y end @testset "100x100" begin @@ -101,37 +99,39 @@ end A = SparseArrays.sprand(100, 100, 0.3) # create vector - x = rand(1:100, 100, 1) + x = rand(100) # create empty solution y = zeros(size(A, 1)) - merge_csr_mv!(3, A, x, y, transpose) + merge_csr_mv!(3.0, A, x, y, transpose) - @test Matrix(A) * x * 3 ≈ y + @test transpose(Matrix(A)) * x * 3 ≈ y end @testset "Matrix x Matrix" begin ## Calculate AXα # Create Matrices - A = SparseArrays.sparse(rand(1:10, 3, 2)) - X = rand(1:10, 2, 4) + # 3 x 2 + A = sprand(3, 2, 0.4) + # 3 x 4 + X = rand(3, 4) # set alpha to 1.0 for now - α = 1.0 + α = 1.0 - # Create place to store solution - Y = zeros(3, 4) + # Create place to store solution, + # should be 2 x 4 + Y = zeros(2, 4) # iterate + # some kind of indexing issue going on here for (idx, col) in enumerate(eachcol(X)) - # merge_csr_mv!(A, x, β, y, op) Y_view = @view Y[:, idx] - merge_csr_mv!(α, A, col, Y_view, x -> x) - # merge_csr_mv!(A, col, α, Y_view, x -> x) + merge_csr_mv!(α, A, col, Y_view, transpose) end - @test Matrix(A) * X == Y + @test transpose(Matrix(A)) * X * 1.0 ≈ Y end \ No newline at end of file From 938ecd76162696f50fcbb3ebc10d7a3da75dec6e Mon Sep 17 00:00:00 2001 From: John Long Date: Tue, 3 Jan 2023 18:12:54 -0500 Subject: [PATCH 22/26] Add @inbounds --- src/ParallelMergeCSR.jl | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 416bd91..06c9ef6 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -115,12 +115,12 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # Consume merge items, whole rows first running_total = zero(eltype(output)) while thread_coord.x < thread_coord_end.x - while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end - output[thread_coord.x + 1] += α * running_total + @inbounds output[thread_coord.x + 1] += α * running_total running_total = zero(eltype(output)) thread_coord.x += 1 end @@ -128,13 +128,13 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # May have thread end up partially consuming a row. # Save result form partial consumption and do one pass at the end to add it back to y while thread_coord.y < thread_coord_end.y - running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end # Save carry-outs - row_carry_out[tid] = thread_coord_end.x + 1 - value_carry_out[tid] = running_total + @inbounds row_carry_out[tid] = thread_coord_end.x + 1 + @inbounds value_carry_out[tid] = running_total end From c9b8cccc72dd8a47374ee6f96d70910daeef0145 Mon Sep 17 00:00:00 2001 From: John Long Date: Wed, 4 Jan 2023 10:24:29 -0500 Subject: [PATCH 23/26] Removed unnecessary comments, added SparseArrays definition override --- src/ParallelMergeCSR.jl | 54 ++++------------------------------------- 1 file changed, 5 insertions(+), 49 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 06c9ef6..5e777bb 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -45,45 +45,10 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) end -#= -Algorithm originally designed for CSR now needs to work for CSC - -CSC has fields: -* m - number of rows -* n - number of columns -* colptr::Vector - column j is in colptr[j]:(colptr[j+1]-1) -* rowval::Vector - row indices of stored values -* nzval::Vector - stored values, typically nonzeros - -CSR used to have fields: -* m - number of rows -* n - number of columns -* rowptr::Vector - row i is in rowptr[i]:(rowptr[i+1]-1) -* colval::Vector - col indices of stored values -* nzval::Vector - Stored values, typically non-zeros - -Given a matrix A in CSC, if you do transpose(A), indexing -individual elements is fine but it's a lazy transpose. -To materialize the changes, probably requires you perform some copy operation -e.g. can see changes via dump(copy(transpose(m))) where m isa AbstractMatrixCSC - -* NOTE: we don't want the transpose to actually change the matrix dimensions, -we do it just so the INTERNAL representation of the matrix looks like a CSR -=# - - -# TODO: -# 1. add @inbounds to remove bounds check -# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix -# StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place -# Axβ = y +# Internally treats the matrix as if its been transposed/is an adjoint +# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) - # transpose the CSC to CSR - ## colptr in CSC equiv. to rowptr in CSR - ## rowval in CSC equiv. to colval in CSR - ## rows are now columns so the m x n dimensions after transpose are n x m - nzv = nonzeros(A) rv = rowvals(A) cp = getcolptr(A) @@ -151,21 +116,17 @@ end # C = transpose(A)B + Cβ # C = xABα + Cβ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) - @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) + @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) # obtains the original matrix underneath the "lazy wrapper" A = xA.parent size(A, 2) == size(C, 1) || throw(DimensionMismatch()) size(A, 1) == size(B, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) if β != 1 - # assume the rmul! intended to come from SparseArrays - ## (could implement a threaded version of that as well, maybe piggyback off of impl branch) # preemptively handle β so we just handle C = ABα + C β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) end - # move multiplication by alpha into the multithreaded part for (col_idx, input) in enumerate(eachcol(B)) - # merge_csr_mv!(A, x, β, y, op) output = @view C[:, col_idx] merge_csr_mv!(α, A, input, output, $t) end @@ -201,25 +162,20 @@ function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex end end -function mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) +function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) size(A, 2) == size(B, 1) || throw(DimensionMismatch()) size(A, 1) == size(C, 1) || throw(DimensionMismatch()) size(B, 2) == size(C, 2) || throw(DimensionMismatch()) nzv = nonzeros(A) rv = rowvals(A) if β != 1 - # assume the rmul! intended to come from SparseArrays - ## (could implement a threaded version of that as well, maybe piggyback off of impl branch) # preemptively handle β so we just handle C = ABα + C β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) end - # iterate over the columns of C for k in 1:size(C, 2) - # iterate over columns of A @threads for col in 1:size(A, 2) - # multiply each element of B times alpha (single value) @inbounds αxj = B[col,k] * α - for j in nzrange(A, col) # range of indices in nzv of A restricted to column + for j in nzrange(A, col) @inbounds val = nzv[j]*αxj @inbounds row = rv[j] @inbounds out = @view C[row:row, k] From 5217a3cdf43aa5dd5c088f45838c54b578cfea95 Mon Sep 17 00:00:00 2001 From: John Long Date: Wed, 4 Jan 2023 12:33:55 -0500 Subject: [PATCH 24/26] Reordered imports to alphabetical order --- src/ParallelMergeCSR.jl | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 5e777bb..1a74996 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -1,16 +1,13 @@ module ParallelMergeCSR +using Atomix +using Base.Threads using SparseArrays +using LinearAlgebra: Adjoint,adjoint,Transpose,transpose using SparseArrays: AbstractSparseMatrixCSC, DenseInputVecOrMat, getcolptr -using LinearAlgebra: Adjoint,adjoint,Transpose,transpose - -using Base.Threads - -using Atomix - mutable struct Coordinate x::Int y::Int From b033e3dcbc9f57cda21b6d82c4e0e18e407097e5 Mon Sep 17 00:00:00 2001 From: John Long Date: Wed, 4 Jan 2023 12:34:39 -0500 Subject: [PATCH 25/26] added complex-valued unit tests for merge_csr_mv, removed incompatible tests --- test/merge_csr_mv!.jl | 152 +++++++++++++++++++++++++++++++++--------- 1 file changed, 120 insertions(+), 32 deletions(-) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 5fc5ec9..0eda52c 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -7,7 +7,7 @@ using SparseArrays ## interfere @testset "Extreme Cases" begin - @testset "Singleton" begin + @testset "Singleton (Real)" begin A = sparse(reshape([1], 1, 1)) x = rand(1) @@ -19,43 +19,63 @@ using SparseArrays @test Matrix(A) * x * 0.3 == y end - @testset "Single row" begin + @testset "Singleton (Complex)" begin + A = sparse(reshape([1.0+2.5im], 1, 1)) + + x = rand(1) + + y = zeros(eltype(A), size(A, 1)) + + merge_csr_mv!(10.1, A, x, y, adjoint) + + @test adjoint(Matrix(A)) * x * 10.1 == y + end + + @testset "Single row (Real)" begin # 10 x 1 converted to 1 x 10 - A = SparseArrays.sprand(10, 1, 0.3) + A = 10.0 * SparseArrays.sprand(10, 1, 0.3) # x needs to be 10 x 1 x = rand(size(A, 1)) - y = zeros(eltype(x), 1) + y = zeros(eltype(A), 1) merge_csr_mv!(1.1, A, x, y, transpose) @test (transpose(Matrix(A)) * x) * 1.1 ≈ y end - @testset "Single column" begin + @testset "Single row (Complex)" begin - # 1 x 10 is now 10 x 1 - A = SparseArrays.sprand(1, 10, 0.3) - - # needs to be 1 x 1 - x = rand(1) + # 10 x 1 is treated as 1 x 10 inside merge_csr_mv! + A = SparseArrays.sprand(Complex{Float64}, 10, 1, 0.3) - # 10 x 1 - y = zeros(size(A,2)) - - merge_csr_mv!(0.7, A, x, y, adjoint) + x = rand(eltype(A), size(A, 1)) - y_exact = 0.7 * adjoint(Matrix(A)) * x + y = zeros(eltype(A), 1) - @test y_exact ≈ y + merge_csr_mv!(1.1, A, x, y, transpose) + + @test (transpose(Matrix(A)) * x) * 1.1 ≈ y end + + + #= + Single Column test wouldn't work because + merge_csr_mv! is designed to handle Ax = y where + A is m x n, x is n x 1, and y must be m x 1. + + Given a single column A that is 1 x z, merge_csr_mv! treats it as + z x 1, which means y is constrained to 1 x 1 and the correct + output would be z x 1, which merge_csr_mv! is not designed for. + + On the other hand, the overridden SparseArrays.mul! should work + =# end # test fails, y seems to have lots of zero-entries -@testset "Square" begin - # 10 x 10 with 30% chance of entry being made +@testset "Square (Real)" begin A = SparseArrays.sprand(10,10,0.3) # 10 x 1 @@ -70,8 +90,20 @@ end end -@testset "4x6" begin - # create matrix +@testset "Square (Complex)" begin + A = SparseArrays.sprand(Complex{Float64}, 10, 10, 0.3) + + x = 10 * rand(Complex{Float64}, 10) + + y = zeros(eltype(A), size(x)) + + merge_csr_mv!(1.1, A, x, y, adjoint) + + @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y +end + +@testset "4x6 (Real)" begin + # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix m = [10 20 0 0 0 0; 0 30 0 40 0 0; 0 0 50 60 70 0; @@ -81,7 +113,6 @@ end A = sparse(m) # create vector - # x = [5,2,3,1,8,2] x = [5,2,3,1] # create empty solution @@ -94,7 +125,30 @@ end @test Matrix(adjoint(m)) * x * 2.0 == y end -@testset "100x100" begin +@testset "4 x 6 (Complex)" begin + # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix + m = [10 20 0 0 0 0; + 0 30 0 40 0 0; + 0 0 50 60 70 0; + 0 0 0 0 0 80] + + # get into CSC form + A = sparse(m) + + # create vector + x = 22.1 * rand(Complex{Float64}, 4) + + # create empty solution + y = zeros(eltype(x), size(A, 2)) + + # multiply + merge_csr_mv!(2.0, A, x, y, adjoint) + + @test Matrix(adjoint(m)) * x * 2.0 == y + +end + +@testset "100x100 (Real)" begin # create matrix A = SparseArrays.sprand(100, 100, 0.3) @@ -109,29 +163,63 @@ end @test transpose(Matrix(A)) * x * 3 ≈ y end -@testset "Matrix x Matrix" begin +@testset "100x100 (Complex)" begin + # create matrix + A = SparseArrays.sprand(Complex{Float64}, 100, 100, 0.3) + + # create vector + x = rand(Complex{Float64}, 100) + + # create empty solution + y = zeros(eltype(x), size(A, 1)) + + merge_csr_mv!(3.0, A, x, y, transpose) + + @test transpose(Matrix(A)) * x * 3 ≈ y +end + +#= + NOTE: While merge_csr_mv! can be used this way, the overriden SparseArrays.mul! + should be the preferred method to do Matrix-Matrix multiplication. +=# +@testset "Matrix-Matrix (Real)" begin - ## Calculate AXα # Create Matrices - # 3 x 2 + # 3 x 2 (treated as 2 x 3 in merge_csr_mv!) A = sprand(3, 2, 0.4) # 3 x 4 X = rand(3, 4) - # set alpha to 1.0 for now - α = 1.0 + α = 9.2 - # Create place to store solution, - # should be 2 x 4 Y = zeros(2, 4) - # iterate - # some kind of indexing issue going on here for (idx, col) in enumerate(eachcol(X)) Y_view = @view Y[:, idx] merge_csr_mv!(α, A, col, Y_view, transpose) end - @test transpose(Matrix(A)) * X * 1.0 ≈ Y + @test transpose(Matrix(A)) * X * 9.2 ≈ Y + +end + +@testset "Matrix-Matrix (Real)" begin + + # Create Matrices + # 3 x 2 (treated as 2 x 3 in merge_csr_mv!) + A = sprand(Complex{Float64}, 3, 2, 0.4) + # 3 x 4 + X = rand(3, 4) + + α = 9.2 + + Y = zeros(eltype(A), 2, 4) + + for (idx, col) in enumerate(eachcol(X)) + Y_view = @view Y[:, idx] + merge_csr_mv!(α, A, col, Y_view, adjoint) + end + + @test adjoint(Matrix(A)) * X * 9.2 ≈ Y end \ No newline at end of file From 87852a68e2dec0f993aff1c0944ebe223eb288cb Mon Sep 17 00:00:00 2001 From: John Long Date: Wed, 4 Jan 2023 14:34:29 -0500 Subject: [PATCH 26/26] Added test for complex and real-valued matrices/vectors/coefficients in mul --- test/mul!.jl | 127 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 112 insertions(+), 15 deletions(-) diff --git a/test/mul!.jl b/test/mul!.jl index 16e3683..5b22b6a 100644 --- a/test/mul!.jl +++ b/test/mul!.jl @@ -2,8 +2,25 @@ using Test using ParallelMergeCSR using SparseArrays -# trigger merge_csr_mv! -@testset "Adjoint" begin +# trigger merge_csr_mv! in this repo, does not default to mul! somewhere else +@testset "Adjoint Real" begin + + A = adjoint(sprand(5, 5, 0.3)) + + B = rand(5, 5) + α = 11.2 + + C = rand(5, 5) + C_copy = deepcopy(C) + β = 3.9 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β + +end + +@testset "Adjoint Complex" begin # C = adjoint(A)Bα + Cβ A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3)) @@ -17,10 +34,11 @@ using SparseArrays SparseArrays.mul!(C, A, B, α, β) @test C ≈ Matrix(A) * B * α + C_copy * β -end +end # trigger merge_csr_mv! -@testset "Transpose Square" begin + +@testset "Transpose Square Real" begin # C = transpose(A)Bα + Cβ A = transpose(sparse(rand(1:5, 4, 4))) @@ -35,29 +53,62 @@ end # right hand side is correct, left hand side is problematic @test C ≈ Matrix(A) * B * α + C_copy * β -end -@testset "Transpose Rectangular" begin +end - A = sparse([1 2 3 4; 5 6 7 8]) |> transpose - B = [1 2 3; 4 5 6] +@testset "Transpose Square Complex" begin + # C = transpose(A)Bα + Cβ + A = transpose(sparse(rand(Complex{Float64}, 4, 4))) + B = rand(4,4) α = 1.0 - C = zeros(4, 3) + C = zeros(eltype(A), 4, 4) + C_copy = deepcopy(C) + β = 6.11 + 9.2im + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β +end + +@testset "Transpose Rectangular Real" begin + + A = sparse(rand(4, 2)) |> transpose + B = rand(4, 3) + + α = 9.1 + + C = zeros(eltype(A), 2, 3) C_copy = deepcopy(C) β = 1.0 SparseArrays.mul!(C, A, B, α, β) @test C ≈ Matrix(A) * B * α + C_copy * β + end -@testset "Matrix x Matrix" begin +@testset "Transpose Rectangular Complex" begin + + A = sparse(rand(Complex{Float64}, 4, 2)) |> transpose + B = rand(4, 3) + + α = 0.0 + 5.3im + + C = zeros(eltype(A), 2, 3) + C_copy = deepcopy(C) + β = 1.0 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β + +end - # C = ABα + Cβ - ## A should be CSC matrix, B should be Dense - ## α, β are `Number`s +@testset "Matrix x Matrix (Real)" begin + + # A should be sparse, B should be dense A = sprand(5, 5, 0.2) B = rand(5, 5) α = 1.1 @@ -71,7 +122,23 @@ end end -@testset "Matrix x Vector" begin +@testset "Matrix x Matrix (Complex)" begin + + # A should be sparse, B should be dense + A = sprand(5, 5, 0.2) + B = rand(Complex{Float64}, 5, 5) + α = 1.1 + + C = zeros(eltype(B), 5, 5) + C_copy = deepcopy(C) + β = 0.3 + + SparseArrays.mul!(C, A, B, α, β) + @test C ≈ Matrix(A) * B * α + C_copy * β + +end + +@testset "Matrix x Vector (Real)" begin A = sprand(5, 5, 0.3) B = rand(5) @@ -84,6 +151,36 @@ end SparseArrays.mul!(C, A, B, α, β) @test C ≈ Matrix(A) * B * α + C_copy * β - +end + +@testset "Matrix x Vector (Real)" begin + + A = sprand(5, 5, 0.3) + B = rand(5) + α = 2.5 + + C = zeros(Float64, size(B)) + C_copy = deepcopy(C) + β = 5.2 + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β + +end + +@testset "Matrix x Vector (Complex)" begin + + A = sprand(5, 5, 0.3) + B = rand(Complex{Float64}, 5) + α = 2.5 + + C = zeros(eltype(B), size(B)) + C_copy = deepcopy(C) + β = 2.1+0.1im + + SparseArrays.mul!(C, A, B, α, β) + + @test C ≈ Matrix(A) * B * α + C_copy * β end \ No newline at end of file