diff --git a/README.md b/README.md
index f3806db..10280a6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,60 @@
# ParallelMergeCSR.jl
-A Sparse Matrix library implemented Merge-based Parallel Sparse Matrix-Vector Multiplication.
-see [this paper](https://rd.yyrcd.com/CUDA/2022-03-14-Merge-based%20Parallel%20Sparse%20Matrix-Vector%20Multiplication.pdf)
+[](https://github.com/QuEraComputing/Bloqade.jl/actions)
+
+
+An implementation/port of Merrill and Garland's Merge-based Parallel Sparse Matrix-Vector Multiplication (10.1109/SC.2016.57) paper in
+the
+
+
+ Julia Programming Language
+
+
+
+
+ParallelMergeCSR allows you to perform *multithreaded* [CSC formatted sparse Matrix](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29) multiplication against dense vectors and matrices as long as the sparse Matrix has had a **transpose** or **adjoint** operation applied to it via `LinearAlgebra`, built-in to Julia Base. The reason for this is the original algorithm was restricted to [CSR formatted sparse Matrices](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) but by taking the transpose of a CSC matrix you've created a CSR representation of the same matrix.
+
+ParallelMergeCSR only exports one function: `mul!` which is used for both Sparse Matrix - Dense Vector and Sparse Matrix - Dense Matrix multiplication.
+
+## Installation
+
+Enter the Julia Package Manager and install via:
+
+```
+pkg> add ParallelMergeCSR
+```
+
+## Usage
+
+Start Julia with the desired number of threads by launching it with the `-t`/`--threads` argument:
+```
+julia --threads <number_of_threads>
+```
+or setting `JULIA_NUM_THREADS` in your environment and then running Julia.
+
+You can confirm Julia is using the specified number of threads via:
+
+```julia
+julia> Threads.nthreads()
+```
+
+You can then use `ParallelMergeCSR` in a similar fashion to the example below:
+
+```julia
+julia> using ParallelMergeCSR, SparseArrays
+
+# create a 20x20 transposed CSC-formatted Sparse matrix with a 30% chance of values appearing
+julia> A = transpose(sprand(20, 20, 0.3));
+
+# dense vector (can be a matrix too)
+julia> B = rand(size(A, 2));
+
+# output
+julia> C = rand(size(A, 2));
+
+# coefficients
+julia> α = -1.0; β = 2.9;
+
+# perform the operation C = ABα + Cβ, mutating C to store the answer
+julia> ParallelMergeCSR.mul!(C, A, B, α, β)
+```
diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index c2662ea..84bfdac 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -1,5 +1,12 @@
+"""
+ Range <: AbstractVector{Int}
+Functionally equivalent to `CountingInputIterator` from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+Acts as a vector of integers from `start` to `stop` and used to represent indices to the non-zero values of the sparse matrix.
+
+See also: merge_csr_mv!
+"""
struct Range <: AbstractVector{Int}
start::Int
stop::Int
@@ -8,29 +15,65 @@ struct Range <: AbstractVector{Int}
end
end
+"""
+ Base.length(range::Range)
+
+Gets the length of a range, computed as `stop - start` — the number of values
+reachable through `getindex` (which yields `start + 1` through `stop`).
+"""
Base.length(range::Range) = (range.stop-range.start) # includes both ends
+
+"""
+ Base.size(range::Range)
+
+Gets the size of a range, which is equal to its length.
+
+See also: `Base.length(range::Range)`
+"""
Base.size(range::Range) = (length(range),)
+"""
+ Base.getindex(range::Range,index::Int)
+
+Index into a `Range` using `index` with an exception thrown if the index falls out of bounds
+of the range.
+"""
@inline function Base.getindex(range::Range,index::Int)
@boundscheck (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
return index + range.start
end
+"""
+ mutable struct Coordinate
+An implementation of the C++ `CoordinateT` struct from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Holds the "coordinate" (a combination of a row offset value and an index into the
+non-zero values) generated by `merge_path_search`.
+
+See also: `merge_path_search`
+"""
mutable struct Coordinate
x::Int
y::Int
end
-# Want to give the illusion that we're still using 0-based coordinates when the actual
-# values themselves are 1-based
-#
-# diagonal -> i + j = k
-# a_len and b_len can stay the same from the paper
-#
-# a -> row-end offsets so really pass in a[2:end]
-# b -> "natural" numbers
+
+"""
+ merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector)
+
+An implementation of the C++ `MergePathSearch` function from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Given a `diagonal` the grid formed by the row offsets `a` and the indices of the non-zero values `b`
+as well as their respective lengths `a_len` and `b_len`, find the coordinate on the grid where a thread could
+begin/end at for its workload.
+"""
function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector)
+
+ # diagonal -> i + j = k
+ # a_len and b_len can stay the same from the paper
+ # a are row-end offsets so really pass in a[2:end]
+ # b are "natural" numbers (need to start at 1 instead of paper's 0 due to Julia 1-indexing)
# Diagonal search range (in x coordinate space)
x_min = max(diagonal - b_len, 0)
x_max = min(diagonal, a_len)
@@ -53,7 +96,18 @@ end
# Internally treats the matrix as if its been transposed/is an adjoint
# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
-function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+"""
+ merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+An implementation of the C++ `OmpMergeCsrmv` function from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Performs the operation output = A * input * α + output.
+
+Note that while A is a CSC-formatted sparse matrix, internally the function treats it as a CSR matrix (e.g. if you passed
+in a 2x4, internally it's a 4x2). The `op` argument is used to handle the case where the CSC matrix should be treated
+as if it's had the adjoint applied to it.
+"""
+function merge_csr_mv!(α::Number, A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
nzv = nonzeros(A)
rv = rowvals(A)
@@ -61,11 +115,10 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
nnz = length(nzv)
- # nrows = length(cp) - 1 can give the wrong number of rows!
nrows = A.n
nz_indices = Range(1,nnz)
- row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
+ row_end_offsets = cp[2:end]
num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
num_threads = nthreads()
@@ -117,27 +170,31 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
end
-# C = adjoint(A)Bα + Cβ
-# C = transpose(A)B + Cβ
-# C = xABα + Cβ
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
- @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
- # obtains the original matrix underneath the "lazy wrapper"
- A = xA.parent
- size(A, 2) == size(C, 1) || throw(DimensionMismatch())
- size(A, 1) == size(B, 1) || throw(DimensionMismatch())
- size(B, 2) == size(C, 2) || throw(DimensionMismatch())
- if β != 1
- # preemptively handle β so we just handle C = ABα + C
- β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
- end
- for (col_idx, input) in enumerate(eachcol(B))
- output = @view C[:, col_idx]
- merge_csr_mv!(α, A, input, output, $t)
+ @eval begin
+ """
+ mul!(C::StridedVecOrMat, xA::$($T){<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+
+ Performs the operation C = xABα + Cβ where xA is an `AbstractSparseMatrixCSC` that has been wrapped by `$($T)`
+ (accomplishable by calling `$($t)` on the CSC matrix).
+ """
+ function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+ # obtains the original matrix underneath the "lazy wrapper"
+ A = xA.parent
+ size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+ size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+ size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+ if β != 1
+ # preemptively handle β so we just handle C = ABα + C
+ β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+ end
+ for (col_idx, input) in enumerate(eachcol(B))
+ output = @view C[:, col_idx]
+ merge_csr_mv!(α, A, input, output, $t)
+ end
+ C
+ # end of @eval macro
end
- C
- # end of @eval macro
- end
# end of for loop
-
+ end
end
\ No newline at end of file
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index d7beca7..3f4ef08 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -19,9 +19,6 @@ using SparseArrays: sprand, sparse
end
-## NOTE: Sparse matrices are converted to dense form in the @test's
-## considering that our redefinition of SparseArrays.mul! seems to
-## interfere
@testset "Extreme Cases" begin
@testset "Singleton (Real)" begin
A = sparse(reshape([1], 1, 1))
@@ -90,7 +87,6 @@ end
=#
end
-# test fails, y seems to have lots of zero-entries
@testset "Square (Real)" begin
A = sprand(10,10,0.3)
@@ -195,7 +191,7 @@ end
end
#=
- NOTE: While merge_csr_mv! can be used this way, the overriden SparseArrays.mul!
+ NOTE: While merge_csr_mv! can be used this way, the mul!
should be the preferred method to do Matrix-Matrix multiplication.
=#
@testset "Matrix-Matrix (Real)" begin