From 51c8ece3d32e43e993b099eba02dcc9957296cce Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Tue, 20 Dec 2022 16:45:02 -0500
Subject: [PATCH 01/26] removing old files and setting todos

---
 src/ParallelMergeCSR.jl | 136 ++++++++++++++++++++++++++++++++++++++--
 src/threaded.jl         |   0
 src/types.jl            |   0
 test/threaded.jl        |   9 ---
 test/types.jl           |   0
 5 files changed, 131 insertions(+), 14 deletions(-)
 delete mode 100644 src/threaded.jl
 delete mode 100644 src/types.jl
 delete mode 100644 test/threaded.jl
 delete mode 100644 test/types.jl

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index ca33a3a..f9853bd 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -1,13 +1,139 @@
 module ParallelMergeCSR
 
+using SparseArrays: AbstractSparseMatrixCSC
+using LinearAlgebra:Adjoint,adjoint,Transpose,transpose
 
-using SparseArrays
+using Base.Threads
 
-include("types.jl")
-include("threaded.jl")
+mutable struct Coordinate
+    x::Int
+    y::Int
+end
 
-# include("gpu.jl")
-# include("distributed.jl")
+# Want to give the illusion that we're still using 0-based coordinates when the actual
+# values themselves are 1-based
+# 
+# diagonal -> i + j = k 
+# a_len and b_len can stay the same from the paper
+# 
+# a -> row-end offsets so really pass in a[2:end]
+# b -> "natural" numbers
+function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
+    # Diagonal search range (in x coordinate space)
+    x_min = max(diagonal - b_len, 0)
+    x_max = min(diagonal, a_len)
+    # 2D binary-search along diagonal search range
+    while (x_min < x_max)
+        pivot = (x_min + x_max) >> 1
+        if (a[pivot + 1] <= b[diagonal - pivot])
+            x_min = pivot + 1
+        else
+            x_max = pivot
+        end
+    end
+
+    return Coordinate(
+        min(x_min, a_len),
+        diagonal - x_min
+    )
+
+end
+
+# Ax = y
+# A matrix
+# x, y vectors
+
+# TODO: overload SparseArrays.mul!
+# TODO: add β and α arguments
+# TODO: change SparseMatrixCSR => Transpose{AbstractSparseMatrixCSC}
+#     1. colval => rowval 
+#     2. rowptr => colptr
+#     3. use iterface of SparseArrays to get these quantities
+function merge_csr_mv!(A::SparseMatrixCSR, x, y)
+    row_end_offsets = A.rowptr[2:end]
+    nnz_indices = collect(1:length(A.nzval))
+    num_merge_items = length(A.nzval) + A.m
+
+    num_threads = nthreads()
+    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
+
+    row_carry_out = zeros(Int, num_threads)
+    value_carry_out = zeros(Float64, num_threads)
+
+    # Julia threads start id by 1, so make sure to offset!
+    @threads for tid in 1:num_threads
+        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
+        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
+
+        # Get starting and ending thread coordinates (row, nnz)
+        thread_coord = merge_path_search(diagonal, A.m, length(A.nzval), row_end_offsets, nnz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, A.m, length(A.nzval), row_end_offsets, nnz_indices)
+
+        # Consume merge items, whole rows first
+        running_total = 0.0        
+        while thread_coord.x < thread_coord_end.x
+            while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
+                running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]]
+                thread_coord.y += 1
+            end
+
+            y[thread_coord.x + 1] = running_total
+            running_total = 0.0
+            thread_coord.x += 1 
+        end
+       
+        # May have thread end up partially consuming a row.
+        # Save result form partial consumption and do one pass at the end to add it back to y
+        while thread_coord.y < thread_coord_end.y
+            running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]]
+            thread_coord.y += 1
+        end
+
+        # Save carry-outs
+        row_carry_out[tid] = thread_coord_end.x + 1
+        value_carry_out[tid] = running_total
+
+    end
+
+    # Diagonistcs
+    # println("row_carry_out: $row_carry_out")
+    # println("value_carry_out: $value_carry_out")
+    # println("Incomplete y: $y")
+    for tid in 1:num_threads
+        if row_carry_out[tid] <= A.m
+            y[row_carry_out[tid]] += value_carry_out[tid]
+        end
+    end
+
+end
+
+
+# y => C[:,k]
+# x => B[:,k]
+for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
+    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+        A = xA.parent
+        size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+        size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+        size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+        nzv = nonzeros(A)
+        rv = rowvals(A)
+        if β != 1
+            β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
+        end
+        for k in 1:size(C, 2)
+            # parallel implementation goes here acting on slice C[:,k]
+            @inbounds for col in 1:size(A, 2)
+                tmp = zero(eltype(C))
+                for j in nzrange(A, col)
+                    tmp += $t(nzv[j])*B[rv[j],k]
+                end
+                C[col,k] += tmp * α
+            end
+        end
+        C
+    end
+end
 
 
 end
\ No newline at end of file
diff --git a/src/threaded.jl b/src/threaded.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/types.jl b/src/types.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/test/threaded.jl b/test/threaded.jl
deleted file mode 100644
index 08debd5..0000000
--- a/test/threaded.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-using SparseArrays
-using Random
-
-
-
-
-
-
-
diff --git a/test/types.jl b/test/types.jl
deleted file mode 100644
index e69de29..0000000

From 931e670ae0b52279f8f49a9a54d42563280cc64e Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Wed, 28 Dec 2022 12:36:04 -0500
Subject: [PATCH 02/26] override SparseArrays Mul!

---
 src/ParallelMergeCSR.jl | 81 ++++++++++++++++++++++++++++++-----------
 1 file changed, 59 insertions(+), 22 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index f9853bd..dad0316 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -49,10 +49,14 @@ end
 #     1. colval => rowval 
 #     2. rowptr => colptr
 #     3. use iterface of SparseArrays to get these quantities
-function merge_csr_mv!(A::SparseMatrixCSR, x, y)
-    row_end_offsets = A.rowptr[2:end]
-    nnz_indices = collect(1:length(A.nzval))
-    num_merge_items = length(A.nzval) + A.m
+function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op)
+    rv = rowvals(A)
+    nzval = nonzeros(A)
+    nrow = size(A,2) # view CSC tranpose as CSR.
+
+    row_end_offsets = rowvals[2:end]
+    nnz_indices = collect(1:length(nzval))
+    num_merge_items = length(nzval) + nrow
 
     num_threads = nthreads()
     items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
@@ -66,26 +70,26 @@ function merge_csr_mv!(A::SparseMatrixCSR, x, y)
         diagonal_end = min(diagonal + items_per_thread, num_merge_items)
 
         # Get starting and ending thread coordinates (row, nnz)
-        thread_coord = merge_path_search(diagonal, A.m, length(A.nzval), row_end_offsets, nnz_indices)
-        thread_coord_end = merge_path_search(diagonal_end, A.m, length(A.nzval), row_end_offsets, nnz_indices)
+        thread_coord = merge_path_search(diagonal, nrow, length(nzval), row_end_offsets, nnz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrow, length(nzval), row_end_offsets, nnz_indices)
 
         # Consume merge items, whole rows first
-        running_total = 0.0        
+        running_total = zero(eltype(y)) 
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]]
+                running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            y[thread_coord.x + 1] = running_total
-            running_total = 0.0
+            output[thread_coord.x + 1] = running_total
+            running_total = zero(eltype(y)) 
             thread_coord.x += 1 
         end
        
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            running_total += A.nzval[thread_coord.y + 1] * x[A.colval[thread_coord.y + 1]]
+            running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
             thread_coord.y += 1
         end
 
@@ -100,8 +104,8 @@ function merge_csr_mv!(A::SparseMatrixCSR, x, y)
     # println("value_carry_out: $value_carry_out")
     # println("Incomplete y: $y")
     for tid in 1:num_threads
-        if row_carry_out[tid] <= A.m
-            y[row_carry_out[tid]] += value_carry_out[tid]
+        if row_carry_out[tid] <= nrow
+            output[row_carry_out[tid]] += value_carry_out[tid]
         end
     end
 
@@ -116,24 +120,57 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
         size(A, 1) == size(B, 1) || throw(DimensionMismatch())
         size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-        nzv = nonzeros(A)
-        rv = rowvals(A)
+        # nzv = nonzeros(A)
+        # rv = rowvals(A)
         if β != 1
             β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
         for k in 1:size(C, 2)
             # parallel implementation goes here acting on slice C[:,k]
-            @inbounds for col in 1:size(A, 2)
-                tmp = zero(eltype(C))
-                for j in nzrange(A, col)
-                    tmp += $t(nzv[j])*B[rv[j],k]
-                end
-                C[col,k] += tmp * α
-            end
+            y = @view B[:,k]
+            x = @view C[:,k]
+            merge_csr_mv!(A, x, y, $t)
         end
         C
     end
 end
 
 
+function atomic_add(a::AbstractVector{T},b::T) where T <: Real
+    @atomic a[1] += b
+end
+
+function atomic_add(a::AbstractVector{T},b::T) where T <: Complex
+    view_a = reinterpret(real(T),a)
+    real_b = real(b)
+    imag_b = imag(b)
+
+    @atomic view_a[1] += real_b
+    @atomic view_a[2] += imag_b
+end
+
+function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    size(A, 1) == size(C, 1) || throw(DimensionMismatch())
+    size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    if β != 1
+        β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
+    end
+    for k in 1:size(C, 2)
+        @threads for col in 1:size(A, 2)
+            @inbounds αxj = B[col,k] * α
+            for j in nzrange(A, col)
+                @inbounds val = nzv[j]*αxj
+                @inbounds row = rv[j]
+                @inbounds out = C[row:row, k] # pass element as view
+                atomic_add(out,val) 
+            end
+        end
+    end
+    C
+end
+
+
 end
\ No newline at end of file

From e8753b07c89fdd5e30d1a307e85d26225310199b Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Wed, 28 Dec 2022 12:55:48 -0500
Subject: [PATCH 03/26] updating TODOs

---
 src/ParallelMergeCSR.jl | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index dad0316..60cb6f9 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -43,12 +43,9 @@ end
 # A matrix
 # x, y vectors
 
-# TODO: overload SparseArrays.mul!
-# TODO: add β and α arguments
-# TODO: change SparseMatrixCSR => Transpose{AbstractSparseMatrixCSC}
-#     1. colval => rowval 
-#     2. rowptr => colptr
-#     3. use iterface of SparseArrays to get these quantities
+# TODO: 
+# 1. add @inbounds to remove bounds check
+# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix 
 function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op)
     rv = rowvals(A)
     nzval = nonzeros(A)
@@ -125,11 +122,9 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
         if β != 1
             β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
-        for k in 1:size(C, 2)
-            # parallel implementation goes here acting on slice C[:,k]
-            y = @view B[:,k]
-            x = @view C[:,k]
-            merge_csr_mv!(A, x, y, $t)
+        for (k,c) in enumerate(eachcol(C))
+            # parallel implementation goes here acting on slice C[:,k], B[:,k]
+            merge_csr_mv!(A, B[:,k], c, $t)
         end
         C
     end

From cd96e3485e587c4d91f1c65f417a80cb03101481 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Wed, 28 Dec 2022 12:56:01 -0500
Subject: [PATCH 04/26] adding tests

---
 test/runtests.jl | 103 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)

diff --git a/test/runtests.jl b/test/runtests.jl
index e69de29..42a40a2 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -0,0 +1,103 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+# works with AbstractSparseMatrixCSC or just plain AbstractMatrix
+m_to_csr(M::AbstractMatrix) =  M |> SparseArrays.sparse |> ParallelMergeCSR.SparseMatrixCSR
+
+@testset "Extreme Cases" begin
+
+    @testset "Singleton" begin
+        full_m = reshape([1], 1, 1)
+        csr_m = m_to_csr(full_m)
+
+        x = rand(1)
+
+        y = zeros(csr_m.m)
+
+        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+        @test full_m * x == y
+    end
+
+    @testset "Single row" begin
+
+        csc_m = SparseArrays.sprand(10, 1, 0.3)
+        csr_m = m_to_csr(csc_m)
+
+        x = rand(1:10, 1)
+
+        y = zeros(csr_m.m)
+
+        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+        @test csc_m * x ≈ y
+    end
+
+    @testset "Single column" begin
+
+        csc_m = SparseArrays.sprand(10, 1, 0.3)
+        csr_m = m_to_csr(csc_m)
+
+        x = rand(1:10, 1)
+
+        y = zeros(csr_m.m)
+
+        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+        @test csc_m * x ≈ y
+    end
+end
+
+@testset "Square" begin
+    # 10 x 10 with 30% chance of entry being made
+    full_m = SparseArrays.sprand(10,10,0.3)
+    csr_m = m_to_csr(full_m)
+
+    x = rand(1:10, 10, 1)
+
+    y = zeros(csr_m.m)
+
+    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+    @test full_m * x ≈ y
+
+end
+
+@testset "4x6" begin
+    # create matrix
+    full_m = [10 20 0 0 0 0;
+              0 30 0 40 0 0;
+              0 0 50 60 70 0;
+              0 0 0 0 0 80]
+
+    # get into CSR form
+    csr_m = m_to_csr(full_m)
+
+    # create vector
+    x = [5,2,3,1,8,2]
+
+    # create empty solution
+    y = zeros(Int64, csr_m.m)
+    
+    # multiply
+    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+    @test full_m * x == y
+end
+
+@testset "100x100" begin
+    # create matrix
+    csc_m = SparseArrays.sprand(100, 100, 0.3)
+    csr_m = m_to_csr(csc_m)
+
+    # create vector
+    x = rand(1:100, 100, 1)
+
+    # create empty solution
+    y = zeros(csr_m.m)
+
+    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+
+    @test csc_m * x ≈ y
+end
\ No newline at end of file

From 8e12ae2d61a4d3a90684a61a12d5a5977fe7ff72 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Wed, 28 Dec 2022 12:58:54 -0500
Subject: [PATCH 05/26] adding += operator

---
 src/ParallelMergeCSR.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 60cb6f9..7ed32db 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -78,7 +78,7 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output:
                 thread_coord.y += 1
             end
 
-            output[thread_coord.x + 1] = running_total
+            output[thread_coord.x + 1] += running_total
             running_total = zero(eltype(y)) 
             thread_coord.x += 1 
         end

From ededb981621f542103f97cd9706b34eb934e80d6 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Wed, 28 Dec 2022 17:08:38 -0500
Subject: [PATCH 06/26] Fixed broken .toml

---
 Project.toml | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/Project.toml b/Project.toml
index c9d8f73..53aaee3 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,2 +1,7 @@
+name = "ParallelMergeCSR"
+uuid = "c4b929cc-6141-4ba4-8849-280a863346e1"
+version = "0.1.0"
+
 [deps]
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"

From 4e979f52066a3087133629edbd998ec404e2543e Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Thu, 29 Dec 2022 13:48:23 -0500
Subject: [PATCH 07/26] Added testing ability to .toml, merge_csr_mv! works
 with CSC format and Number multiplier

---
 Project.toml            |  7 ++++
 src/ParallelMergeCSR.jl | 91 ++++++++++++++++++++++++++++-------------
 test/runtests.jl        | 65 +++++++++++++----------------
 3 files changed, 99 insertions(+), 64 deletions(-)

diff --git a/Project.toml b/Project.toml
index 53aaee3..5f34ef8 100644
--- a/Project.toml
+++ b/Project.toml
@@ -5,3 +5,10 @@ version = "0.1.0"
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
\ No newline at end of file
diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 7ed32db..e0caf3a 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -1,7 +1,9 @@
 module ParallelMergeCSR
 
-using SparseArrays: AbstractSparseMatrixCSC
-using LinearAlgebra:Adjoint,adjoint,Transpose,transpose
+using SparseArrays
+using SparseArrays: AbstractSparseMatrixCSC,
+                    DenseInputVecOrMat
+using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
 
 using Base.Threads
 
@@ -43,17 +45,51 @@ end
 # A matrix
 # x, y vectors
 
+#=
+Algorithm originally designed for CSR now needs to work for CSC
+
+CSC has fields:
+* m - number of rows
+* n - number of columns
+* colptr::Vector - column j is in colptr[j]:(colptr[j+1]-1)
+* rowval::Vector - row indices of stored values
+* nzval::Vector  - stored values, typically nonzeros
+
+CSR used to have fields: 
+* m - number of rows
+* n - number of columns
+* rowptr::Vector - row i is in rowptr[i]:(rowptr[i+1]-1)
+* colval::Vector - col indices of stored values
+* nzval::Vector - Stored values, typically non-zeros
+
+Given a matrix A in CSC, if you do transpose(A), indexing
+individual elements is fine but it's a lazy transpose.
+To materialize the changes, probably requires you perform some copy operation
+e.g. can see changes via dump(copy(transpose(m))) where m isa AbstractMatrixCSC
+
+* NOTE: we don't want the transpose to actually change the matrix dimensions,
+we do it just so the INTERNAL representation of the matrix looks like a CSR
+=#
+
+
 # TODO: 
 # 1. add @inbounds to remove bounds check
-# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix 
-function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector,op)
-    rv = rowvals(A)
-    nzval = nonzeros(A)
-    nrow = size(A,2) # view CSC tranpose as CSR.
-
-    row_end_offsets = rowvals[2:end]
-    nnz_indices = collect(1:length(nzval))
-    num_merge_items = length(nzval) + nrow
+# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix
+# StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place 
+# Axβ = y
+function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
+
+    # transpose the CSC to CSR
+    ## colptr in CSC equiv. to rowptr in CSR
+    ## rowval in CSC equiv. to colval in CSR
+    ## rows are now columns so the m x n dimensions after transpose are n x m
+    At = copy(transpose(A))
+    row_end_offsets = At.colptr[2:end]
+
+    nz = nonzeros(At)
+    nrows = size(A, 1)
+    nnz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats
+    num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix
 
     num_threads = nthreads()
     items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
@@ -67,26 +103,26 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output:
         diagonal_end = min(diagonal + items_per_thread, num_merge_items)
 
         # Get starting and ending thread coordinates (row, nnz)
-        thread_coord = merge_path_search(diagonal, nrow, length(nzval), row_end_offsets, nnz_indices)
-        thread_coord_end = merge_path_search(diagonal_end, nrow, length(nzval), row_end_offsets, nnz_indices)
+        thread_coord = merge_path_search(diagonal, nrows, length(nz), row_end_offsets, nnz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, length(nz), row_end_offsets, nnz_indices)
 
         # Consume merge items, whole rows first
-        running_total = zero(eltype(y)) 
+        running_total = 0.0        
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            output[thread_coord.x + 1] += running_total
-            running_total = zero(eltype(y)) 
+            y[thread_coord.x + 1] = running_total * β
+            running_total = 0.0
             thread_coord.x += 1 
         end
        
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            running_total += op(nzval[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β
             thread_coord.y += 1
         end
 
@@ -96,13 +132,9 @@ function merge_csr_mv!(A::AbstractSparseMatrixCSC, input::StridedVector, output:
 
     end
 
-    # Diagonistcs
-    # println("row_carry_out: $row_carry_out")
-    # println("value_carry_out: $value_carry_out")
-    # println("Incomplete y: $y")
     for tid in 1:num_threads
-        if row_carry_out[tid] <= nrow
-            output[row_carry_out[tid]] += value_carry_out[tid]
+        if row_carry_out[tid] <= nrows
+            y[row_carry_out[tid]] += value_carry_out[tid]
         end
     end
 
@@ -117,8 +149,7 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
         size(A, 1) == size(B, 1) || throw(DimensionMismatch())
         size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-        # nzv = nonzeros(A)
-        # rv = rowvals(A)
+        # move multipication by beta into the multithreaded part
         if β != 1
             β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
@@ -127,10 +158,13 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
             merge_csr_mv!(A, B[:,k], c, $t)
         end
         C
+        # end of @eval macro
     end
+    # end of for loop
 end
 
-
+#=
+# Throws error about @atomic behavior
 function atomic_add(a::AbstractVector{T},b::T) where T <: Real
     @atomic a[1] += b
 end
@@ -166,6 +200,7 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De
     end
     C
 end
+=#
 
-
+# end of the module
 end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 42a40a2..32ab8cc 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -2,102 +2,95 @@ using Test
 using ParallelMergeCSR
 using SparseArrays
 
-# works with AbstractSparseMatrixCSC or just plain AbstractMatrix
-m_to_csr(M::AbstractMatrix) =  M |> SparseArrays.sparse |> ParallelMergeCSR.SparseMatrixCSR
 
 @testset "Extreme Cases" begin
 
     @testset "Singleton" begin
-        full_m = reshape([1], 1, 1)
-        csr_m = m_to_csr(full_m)
+        A = sparse(reshape([1], 1, 1))
 
         x = rand(1)
 
-        y = zeros(csr_m.m)
+        y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
 
-        @test full_m * x == y
+        @test A * x == y
     end
 
     @testset "Single row" begin
 
-        csc_m = SparseArrays.sprand(10, 1, 0.3)
-        csr_m = m_to_csr(csc_m)
+        A = SparseArrays.sprand(10, 1, 0.3)
 
         x = rand(1:10, 1)
 
-        y = zeros(csr_m.m)
+        y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint)
 
-        @test csc_m * x ≈ y
+        @test A * x ≈ y
     end
 
     @testset "Single column" begin
 
-        csc_m = SparseArrays.sprand(10, 1, 0.3)
-        csr_m = m_to_csr(csc_m)
+        A = SparseArrays.sprand(10, 1, 0.3)
 
         x = rand(1:10, 1)
 
-        y = zeros(csr_m.m)
+        y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
 
-        @test csc_m * x ≈ y
+        @test A * x ≈ y
     end
 end
 
 @testset "Square" begin
     # 10 x 10 with 30% chance of entry being made
-    full_m = SparseArrays.sprand(10,10,0.3)
-    csr_m = m_to_csr(full_m)
+    A = SparseArrays.sprand(10,10,0.3)
 
-    x = rand(1:10, 10, 1)
+    x = rand(10)
 
-    y = zeros(csr_m.m)
+    y = zeros(size(A, 1))
 
-    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+    ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint)
 
-    @test full_m * x ≈ y
+    @test (A * x) * 1.1 ≈ y
 
 end
 
 @testset "4x6" begin
     # create matrix
-    full_m = [10 20 0 0 0 0;
-              0 30 0 40 0 0;
-              0 0 50 60 70 0;
-              0 0 0 0 0 80]
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
 
     # get into CSR form
-    csr_m = m_to_csr(full_m)
+    A = sparse(m)
 
     # create vector
     x = [5,2,3,1,8,2]
 
     # create empty solution
-    y = zeros(Int64, csr_m.m)
+    y = zeros(Int64, size(A, 1))
     
     # multiply
-    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+    ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint)
 
-    @test full_m * x == y
+    @test (A * x) * 2.0 == y
 end
 
 @testset "100x100" begin
     # create matrix
-    csc_m = SparseArrays.sprand(100, 100, 0.3)
-    csr_m = m_to_csr(csc_m)
+    A = SparseArrays.sprand(100, 100, 0.3)
 
     # create vector
     x = rand(1:100, 100, 1)
 
     # create empty solution
-    y = zeros(csr_m.m)
+    y = zeros(size(A, 1))
 
-    ParallelMergeCSR.merge_csr_mv!(csr_m, x, y)
+    ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose)
 
-    @test csc_m * x ≈ y
+    @test (A * x) * 3 ≈ y
 end
\ No newline at end of file

From eb4604f8845ddbccced801d81b10afa15f53adfc Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Thu, 29 Dec 2022 14:00:54 -0500
Subject: [PATCH 08/26] Added @inbounds, use nnz to denote number of nonzeros
 vs length(nz)

---
 src/ParallelMergeCSR.jl | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index e0caf3a..907a064 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -41,10 +41,6 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
 
 end
 
-# Ax = y
-# A matrix
-# x, y vectors
-
 #=
 Algorithm originally designed for CSR now needs to work for CSC
 
@@ -87,8 +83,9 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
     row_end_offsets = At.colptr[2:end]
 
     nz = nonzeros(At)
+    nnz = length(nz) 
     nrows = size(A, 1)
-    nnz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats
+    nz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats
     num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix
 
     num_threads = nthreads()
@@ -102,19 +99,19 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
         diagonal = min(items_per_thread * (tid - 1), num_merge_items)
         diagonal_end = min(diagonal + items_per_thread, num_merge_items)
 
-        # Get starting and ending thread coordinates (row, nnz)
-        thread_coord = merge_path_search(diagonal, nrows, length(nz), row_end_offsets, nnz_indices)
-        thread_coord_end = merge_path_search(diagonal_end, nrows, length(nz), row_end_offsets, nnz_indices)
+        # Get starting and ending thread coordinates (row, nz)
+        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
 
         # Consume merge items, whole rows first
         running_total = 0.0        
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]]
+                @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            y[thread_coord.x + 1] = running_total * β
+            @inbounds y[thread_coord.x + 1] = running_total * β
             running_total = 0.0
             thread_coord.x += 1 
         end
@@ -122,7 +119,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β
+            @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β
             thread_coord.y += 1
         end
 
@@ -133,8 +130,8 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
     end
 
     for tid in 1:num_threads
-        if row_carry_out[tid] <= nrows
-            y[row_carry_out[tid]] += value_carry_out[tid]
+        @inbounds if row_carry_out[tid] <= nrows
+            @inbounds y[row_carry_out[tid]] += value_carry_out[tid]
         end
     end
 

From b62adb7da2d78671ab30d7b3db1fe8ca2f7f2ec7 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Thu, 29 Dec 2022 16:48:28 -0500
Subject: [PATCH 09/26] Attempted fix for atomic ops, renamed args for
 merge_csr_mv, add tests

---
 Project.toml            |  4 +-
 src/ParallelMergeCSR.jl | 61 +++++++++++++++----------
 test/atomic_add!.jl     | 27 +++++++++++
 test/merge_csr_mv!.jl   | 99 +++++++++++++++++++++++++++++++++++++++++
 test/mul!.jl            | 27 +++++++++++
 test/runtests.jl        | 95 +++------------------------------------
 6 files changed, 199 insertions(+), 114 deletions(-)
 create mode 100644 test/atomic_add!.jl
 create mode 100644 test/merge_csr_mv!.jl
 create mode 100644 test/mul!.jl

diff --git a/Project.toml b/Project.toml
index 5f34ef8..f1a521d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,12 +3,12 @@ uuid = "c4b929cc-6141-4ba4-8849-280a863346e1"
 version = "0.1.0"
 
 [deps]
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 
-
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 
 [targets]
-test = ["Test"]
\ No newline at end of file
+test = ["Test"]
diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 907a064..b0692d9 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -7,6 +7,8 @@ using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
 
 using Base.Threads
 
+using Atomix
+
 mutable struct Coordinate
     x::Int
     y::Int
@@ -73,7 +75,7 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR
 # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix
 # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place 
 # Axβ = y
-function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
+function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op)
 
     # transpose the CSC to CSR
     ## colptr in CSC equiv. to rowptr in CSR
@@ -107,11 +109,11 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
         running_total = 0.0        
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]]
+                @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            @inbounds y[thread_coord.x + 1] = running_total * β
+            @inbounds output[thread_coord.x + 1] = running_total * β
             running_total = 0.0
             thread_coord.x += 1 
         end
@@ -119,7 +121,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            @inbounds running_total += op(nz[thread_coord.y + 1]) * x[At.rowval[thread_coord.y + 1]] * β
+            @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] * β
             thread_coord.y += 1
         end
 
@@ -131,48 +133,62 @@ function merge_csr_mv!(A::SparseMatrixCSC, x, β::Number, y, op)
 
     for tid in 1:num_threads
         @inbounds if row_carry_out[tid] <= nrows
-            @inbounds y[row_carry_out[tid]] += value_carry_out[tid]
+            @inbounds output[row_carry_out[tid]] += value_carry_out[tid]
         end
     end
 
 end
 
 
-# y => C[:,k]
-# x => B[:,k]
+# C = adjoint(A)Bα + Cβ
+# C = transpose(A)B + Cβ
+# C = xABα + Cβ
 for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
     @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+        # obtains the original matrix underneath the "lazy wrapper"
         A = xA.parent
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
         size(A, 1) == size(B, 1) || throw(DimensionMismatch())
         size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-        # move multipication by beta into the multithreaded part
-        if β != 1
-            β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
-        end
-        for (k,c) in enumerate(eachcol(C))
-            # parallel implementation goes here acting on slice C[:,k], B[:,k]
-            merge_csr_mv!(A, B[:,k], c, $t)
+        # move multiplication by beta into the multithreaded part
+        ABα = similar(C)
+        for (row_idx, col) in enumerate(eachcol(B))
+            # merge_csr_mv!(A, x, β, y, op)
+            merge_csr_mv!(A, col, α, ABα[row_idx, :], $t)
         end
+        C = ABα + C*β
         C
         # end of @eval macro
     end
     # end of for loop
 end
 
-#=
+
 # Throws error about @atomic behavior
-function atomic_add(a::AbstractVector{T},b::T) where T <: Real
-    @atomic a[1] += b
+## Probably meant to add b to every element of a while mutating a?
+## function names should be suffixed w/ exclamation mark
+function atomic_add!(a::AbstractVector{T},b::T) where T <: Real
+    # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes
+    # you'd have to wrap each
+    # Objective: for each element in AbstractVector, atomically add b to the element
+    # Can also use Atomix.jl
+    for idx in 1:length(a)
+        Atomix.@atomic a[idx] += b
+    end
+    
 end
 
-function atomic_add(a::AbstractVector{T},b::T) where T <: Complex
-    view_a = reinterpret(real(T),a)
+function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex
+    # return type representing real part
+    view_a = reinterpret(reshape, real(T),a)
     real_b = real(b)
     imag_b = imag(b)
 
-    @atomic view_a[1] += real_b
-    @atomic view_a[2] += imag_b
+    # mutating the view also mutates the underlying array it came from
+    for idx in 1:size(view_a, 2)
+        Atomix.@atomic view_a[1, idx] += real_b
+        Atomix.@atomic view_a[2, idx] += imag_b
+    end
 end
 
 function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
@@ -191,13 +207,12 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De
                 @inbounds val = nzv[j]*αxj
                 @inbounds row = rv[j]
                 @inbounds out = C[row:row, k] # pass element as view
-                atomic_add(out,val) 
+                atomic_add!(out,val) 
             end
         end
     end
     C
 end
-=#
 
 # end of the module
 end
\ No newline at end of file
diff --git a/test/atomic_add!.jl b/test/atomic_add!.jl
new file mode 100644
index 0000000..5814afa
--- /dev/null
+++ b/test/atomic_add!.jl
@@ -0,0 +1,27 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+@testset "atomic_add!" begin
+    # atomic add with real
+    @testset "Real" begin
+        a = rand(Float64, 20)
+        a_copy = deepcopy(a)
+
+        b = 1.25
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+
+    # atomic add with complex
+    @testset "Complex" begin
+        a = rand(Complex{Float64}, 20)
+        a_copy = deepcopy(a)
+
+        b = 2.5 + 0.8im
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+end
\ No newline at end of file
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
new file mode 100644
index 0000000..fecf2e8
--- /dev/null
+++ b/test/merge_csr_mv!.jl
@@ -0,0 +1,99 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+## NOTE: Sparse matrices are converted to dense form in the @test's
+##       considering that our redefinition of SparseArrays.mul! seems to
+##       interfere
+
+@testset "Extreme Cases" begin
+
+    @testset "Singleton" begin
+        A = sparse(reshape([1], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(size(A, 1))
+
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
+
+        @test Matrix(A) * x == y
+    end
+
+    @testset "Single row" begin
+
+        A = SparseArrays.sprand(10, 1, 0.3)
+
+        x = rand(1:10, 1)
+
+        y = zeros(size(A, 1))
+
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint)
+
+        @test Matrix(A) * x ≈ y
+    end
+
+    @testset "Single column" begin
+
+        A = SparseArrays.sprand(10, 1, 0.3)
+
+        x = rand(1:10, 1)
+
+        y = zeros(size(A, 1))
+
+        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
+
+        @test Matrix(A) * x ≈ y
+    end
+end
+
+@testset "Square" begin
+    # 10 x 10 with 30% chance of entry being made
+    A = SparseArrays.sprand(10,10,0.3)
+
+    x = rand(10)
+
+    y = zeros(size(A, 1))
+
+    ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint)
+
+    @test (Matrix(A) * x) * 1.1 ≈ y
+
+end
+
+@testset "4x6" begin
+    # create matrix
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
+
+    # get into CSR form
+    A = sparse(m)
+
+    # create vector
+    x = [5,2,3,1,8,2]
+
+    # create empty solution
+    y = zeros(Int64, size(A, 1))
+    
+    # multiply
+    ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint)
+
+    @test m * x * 2.0 == y
+end
+
+@testset "100x100" begin
+    # create matrix
+    A = SparseArrays.sprand(100, 100, 0.3)
+
+    # create vector
+    x = rand(1:100, 100, 1)
+
+    # create empty solution
+    y = zeros(size(A, 1))
+
+    ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose)
+
+    @test Matrix(A) * x * 3 ≈ y
+end
\ No newline at end of file
diff --git a/test/mul!.jl b/test/mul!.jl
new file mode 100644
index 0000000..5814afa
--- /dev/null
+++ b/test/mul!.jl
@@ -0,0 +1,27 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+@testset "atomic_add!" begin
+    # atomic add with real
+    @testset "Real" begin
+        a = rand(Float64, 20)
+        a_copy = deepcopy(a)
+
+        b = 1.25
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+
+    # atomic add with complex
+    @testset "Complex" begin
+        a = rand(Complex{Float64}, 20)
+        a_copy = deepcopy(a)
+
+        b = 2.5 + 0.8im
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index 32ab8cc..ecfe12b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -1,96 +1,13 @@
 using Test
-using ParallelMergeCSR
-using SparseArrays
-
-
-@testset "Extreme Cases" begin
-
-    @testset "Singleton" begin
-        A = sparse(reshape([1], 1, 1))
-
-        x = rand(1)
-
-        y = zeros(size(A, 1))
-
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
-
-        @test A * x == y
-    end
-
-    @testset "Single row" begin
-
-        A = SparseArrays.sprand(10, 1, 0.3)
-
-        x = rand(1:10, 1)
-
-        y = zeros(size(A, 1))
-
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint)
-
-        @test A * x ≈ y
-    end
-
-    @testset "Single column" begin
-
-        A = SparseArrays.sprand(10, 1, 0.3)
-
-        x = rand(1:10, 1)
-
-        y = zeros(size(A, 1))
-
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
-
-        @test A * x ≈ y
-    end
-end
-
-@testset "Square" begin
-    # 10 x 10 with 30% chance of entry being made
-    A = SparseArrays.sprand(10,10,0.3)
-
-    x = rand(10)
-
-    y = zeros(size(A, 1))
-
-    ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint)
-
-    @test (A * x) * 1.1 ≈ y
 
+@testset "merge_csr_mv!" begin
+    include("merge_csr_mv!.jl")
 end
 
-@testset "4x6" begin
-    # create matrix
-    m = [10 20 0 0 0 0;
-         0 30 0 40 0 0;
-         0 0 50 60 70 0;
-         0 0 0 0 0 80]
-
-    # get into CSR form
-    A = sparse(m)
-
-    # create vector
-    x = [5,2,3,1,8,2]
-
-    # create empty solution
-    y = zeros(Int64, size(A, 1))
-    
-    # multiply
-    ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint)
-
-    @test (A * x) * 2.0 == y
+@testset "atomic_add!" begin
+    include("atomic_add!.jl")
 end
 
-@testset "100x100" begin
-    # create matrix
-    A = SparseArrays.sprand(100, 100, 0.3)
-
-    # create vector
-    x = rand(1:100, 100, 1)
-
-    # create empty solution
-    y = zeros(size(A, 1))
-
-    ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose)
-
-    @test (A * x) * 3 ≈ y
+@testset "mul!" begin
+    include("mul!.jl")
 end
\ No newline at end of file

From f7f46b206d3897b3a182b3305c2cea879d19a120 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Thu, 29 Dec 2022 17:57:15 -0500
Subject: [PATCH 10/26] Fixed broken default matrix-matrix, matrix-vector mul,
 added simple tests

---
 src/ParallelMergeCSR.jl | 26 +++++++++++++------
 test/mul!.jl            | 57 +++++++++++++++++++++++++++++------------
 2 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index b0692d9..6360272 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -150,8 +150,14 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
         size(A, 1) == size(B, 1) || throw(DimensionMismatch())
         size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-        # move multiplication by beta into the multithreaded part
-        ABα = similar(C)
+        if β != 1
+            # assume the rmul! intended to come from SparseArrays
+            ## (could implement a threaded version of that as well, maybe piggyback off of impl branch)
+            # preemptively handle β so we just handle C = ABα + C
+            β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+        end
+        # move multiplication by alpha into the multithreaded part
+        ABα = zeros(C)
         for (row_idx, col) in enumerate(eachcol(B))
             # merge_csr_mv!(A, x, β, y, op)
             merge_csr_mv!(A, col, α, ABα[row_idx, :], $t)
@@ -164,12 +170,10 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
 end
 
 
-# Throws error about @atomic behavior
-## Probably meant to add b to every element of a while mutating a?
 ## function names should be suffixed w/ exclamation mark
 function atomic_add!(a::AbstractVector{T},b::T) where T <: Real
     # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes
-    # you'd have to wrap each
+    # you'd have to wrap each element of the array with Atomic
     # Objective: for each element in AbstractVector, atomically add b to the element
     # Can also use Atomix.jl
     for idx in 1:length(a)
@@ -198,15 +202,21 @@ function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::De
     nzv = nonzeros(A)
     rv = rowvals(A)
     if β != 1
-        β != 0 ? rmul!(C, β) : fill!(C, zero(eltype(C)))
+        # assume the rmul! intended to come from SparseArrays
+        ## (could implement a threaded version of that as well, maybe piggyback off of impl branch)
+        # preemptively handle β so we just handle C = ABα + C
+        β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
     end
+    # iterate over the columns of C
     for k in 1:size(C, 2)
+        # iterate over columns of A
         @threads for col in 1:size(A, 2)
+            # multiply each element of B times alpha (single value)
             @inbounds αxj = B[col,k] * α
-            for j in nzrange(A, col)
+            for j in nzrange(A, col) # range of indices in nzv of A restricted to column
                 @inbounds val = nzv[j]*αxj
                 @inbounds row = rv[j]
-                @inbounds out = C[row:row, k] # pass element as view
+                @inbounds out = @view C[row:row, k]
                 atomic_add!(out,val) 
             end
         end
diff --git a/test/mul!.jl b/test/mul!.jl
index 5814afa..e7b27d7 100644
--- a/test/mul!.jl
+++ b/test/mul!.jl
@@ -2,26 +2,49 @@ using Test
 using ParallelMergeCSR
 using SparseArrays
 
-@testset "atomic_add!" begin
-    # atomic add with real
-    @testset "Real" begin
-        a = rand(Float64, 20)
-        a_copy = deepcopy(a)
+# trigger merge_csr_mv!
+@testset "Adjoint" begin
+    
+    # C = ABα + Cβ
+end
 
-        b = 1.25
+# trigger merge_csr_mv!
+@testset "Transpose" begin
 
-        ParallelMergeCSR.atomic_add!(a, b)
-        @test a == (a_copy .+ b)
-    end
+    # C = ABα + Cβ
+end
 
-    # atomic add with complex
-    @testset "Complex" begin
-        a = rand(Complex{Float64}, 20)
-        a_copy = deepcopy(a)
+@testset "Matrix x Matrix" begin
 
-        b = 2.5 + 0.8im
+    # C = ABα + Cβ
+    ## A should be CSC matrix, B should be Dense
+    ## α, β are `Number`s
+    A = sprand(5, 5, 0.2)
+    B = rand(5, 5)
+    α = 1.1
+
+    C = zeros(Float64, 5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+
+end
+
+@testset "Matrix x Vector" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(5)
+    α = 2.5
+
+    C = zeros(Float64, size(B))
+    C_copy = deepcopy(C)
+    β = 5.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+    
 
-        ParallelMergeCSR.atomic_add!(a, b)
-        @test a == (a_copy .+ b)
-    end
 end
\ No newline at end of file

From 45478d1360840d66d72260c4836a149fb53404ab Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Fri, 30 Dec 2022 11:54:32 -0500
Subject: [PATCH 11/26] Added test for matrix x matrix operation using
 merge_csr_mv!

---
 test/merge_csr_mv!.jl | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index fecf2e8..241a258 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -96,4 +96,28 @@ end
     ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose)
 
     @test Matrix(A) * x * 3 ≈ y
+end
+
+@testset "Matrix x Matrix" begin
+
+    ## Calculate AXα
+    # Create Matrices
+    A = SparseArrays.sparse(rand(1:10, 3, 2))
+    X = rand(1:10, 2, 4)
+
+    # set alpha to 1.0 for now
+    α = 1.0
+
+    # Create place to store solution
+    Y = zeros(3, 4)
+
+    # iterate
+    for (idx, col) in enumerate(eachcol(X))
+        # merge_csr_mv!(A, x, β, y, op)
+        Y_view = @view Y[:, idx]
+        ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x)
+    end
+
+    @test Matrix(A) * X == Y 
+
 end
\ No newline at end of file

From 381437f2bc0a3b592895251f3e77e543540477fc Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Fri, 30 Dec 2022 11:55:29 -0500
Subject: [PATCH 12/26] Fixed issue with zeros, partial fix for AB\alpha
 operation inside mul! not mutating

---
 src/ParallelMergeCSR.jl | 10 ++++++----
 test/mul!.jl            | 28 +++++++++++++++++++++++++---
 2 files changed, 31 insertions(+), 7 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 6360272..59d82c8 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -157,12 +157,14 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
             β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
         # move multiplication by alpha into the multithreaded part
-        ABα = zeros(C)
-        for (row_idx, col) in enumerate(eachcol(B))
+        ABα = zeros(size(C))
+        for (col_idx, col) in enumerate(eachcol(B))
             # merge_csr_mv!(A, x, β, y, op)
-            merge_csr_mv!(A, col, α, ABα[row_idx, :], $t)
+            ABα_view = @view ABα[:, col_idx]
+            merge_csr_mv!(A, col, α, ABα_view, $t)
         end
-        C = ABα + C*β
+        # doesn't seem to mutate properly
+        C += ABα
         C
         # end of @eval macro
     end
diff --git a/test/mul!.jl b/test/mul!.jl
index e7b27d7..4520170 100644
--- a/test/mul!.jl
+++ b/test/mul!.jl
@@ -5,13 +5,35 @@ using SparseArrays
 # trigger merge_csr_mv!
 @testset "Adjoint" begin
     
-    # C = ABα + Cβ
+    # C = adjoint(A)Bα + Cβ
+    A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3))
+    B = rand(5,5)
+    α = 2.3
+
+    C = rand(Complex{Float64}, 5, 5)
+    C_copy = deepcopy(C)
+    β = 1.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
 end
 
 # trigger merge_csr_mv!
 @testset "Transpose" begin
 
-    # C = ABα + Cβ
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sprand(4, 4, 0.3))
+    B = rand(1:10, (4,4))
+    α = 1.0
+
+    C = zeros(Float64, 4, 4)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
 end
 
 @testset "Matrix x Matrix" begin
@@ -23,7 +45,7 @@ end
     B = rand(5, 5)
     α = 1.1
 
-    C = zeros(Float64, 5, 5)
+    C = zeros(5, 5)
     C_copy = deepcopy(C)
     β = 0.3
 

From 84393bcb4b7b39d9bb60af9b775c4c855b8b0b13 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Tue, 3 Jan 2023 10:41:02 -0500
Subject: [PATCH 13/26] fixing issue with mul!

---
 src/ParallelMergeCSR.jl | 55 +++++++++++++++++++++++------------------
 1 file changed, 31 insertions(+), 24 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 59d82c8..11645e8 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -2,7 +2,11 @@ module ParallelMergeCSR
 
 using SparseArrays
 using SparseArrays: AbstractSparseMatrixCSC,
-                    DenseInputVecOrMat
+                    DenseInputVecOrMat,
+                    getcolptr,
+                    getrowval,
+                    getnzval
+
 using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
 
 using Base.Threads
@@ -75,53 +79,59 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR
 # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix
 # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place 
 # Axβ = y
-function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op)
+function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op)
 
     # transpose the CSC to CSR
     ## colptr in CSC equiv. to rowptr in CSR
     ## rowval in CSC equiv. to colval in CSR
     ## rows are now columns so the m x n dimensions after transpose are n x m
-    At = copy(transpose(A))
-    row_end_offsets = At.colptr[2:end]
 
-    nz = nonzeros(At)
-    nnz = length(nz) 
-    nrows = size(A, 1)
-    nz_indices = collect(1:length(nz)) # nzval ordering is diff for diff formats
-    num_merge_items = length(nz) + nrows # preserve the dimensions of the original matrix
+    # At = copy(transpose(A))
+    # row_end_offsets = At.colptr[2:end]
+
+    nzv = getnzval(A)
+    rv = getrowval(A)
+    cp = getcolptr(A)
+
+    nnz = length(nzv) 
+    nrows = length(cp) - 1
+
+    nz_indices = rv
+    row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
+    num_merge_items = length(nzv) + nrows # preserve the dimensions of the original matrix
 
     num_threads = nthreads()
     items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
 
-    row_carry_out = zeros(Int, num_threads)
-    value_carry_out = zeros(Float64, num_threads)
+    row_carry_out = zeros(eltype(cp), num_threads)
+    value_carry_out = zeros(eltype(output), num_threads) # value must match output
 
     # Julia threads start id by 1, so make sure to offset!
     @threads for tid in 1:num_threads
         diagonal = min(items_per_thread * (tid - 1), num_merge_items)
         diagonal_end = min(diagonal + items_per_thread, num_merge_items)
 
-        # Get starting and ending thread coordinates (row, nz)
+        # Get starting and ending thread coordinates (row, nzv)
         thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
         thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
 
         # Consume merge items, whole rows first
-        running_total = 0.0        
+        running_total = 0       
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]]
+                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            @inbounds output[thread_coord.x + 1] = running_total * β
-            running_total = 0.0
+            @inbounds output[thread_coord.x + 1] += α * running_total
+            running_total = 0
             thread_coord.x += 1 
         end
        
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            @inbounds running_total += op(nz[thread_coord.y + 1]) * input[At.rowval[thread_coord.y + 1]] * β
+            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] 
             thread_coord.y += 1
         end
 
@@ -133,7 +143,7 @@ function merge_csr_mv!(A::SparseMatrixCSC, input, β::Number, output, op)
 
     for tid in 1:num_threads
         @inbounds if row_carry_out[tid] <= nrows
-            @inbounds output[row_carry_out[tid]] += value_carry_out[tid]
+            @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid]
         end
     end
 
@@ -157,14 +167,11 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
             β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
         # move multiplication by alpha into the multithreaded part
-        ABα = zeros(size(C))
-        for (col_idx, col) in enumerate(eachcol(B))
+        for (col_idx, input) in enumerate(eachcol(B))
             # merge_csr_mv!(A, x, β, y, op)
-            ABα_view = @view ABα[:, col_idx]
-            merge_csr_mv!(A, col, α, ABα_view, $t)
+            output = @view C[:, col_idx]
+            merge_csr_mv!(α, A, input, output, $t)
         end
-        # doesn't seem to mutate properly
-        C += ABα
         C
         # end of @eval macro
     end

From 1a89711c16be2c93118d0dcb4a2d249abdd7e261 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 10:47:51 -0500
Subject: [PATCH 14/26] Try to use @view to fix issue with matrix not being
 mutated

---
 src/ParallelMergeCSR.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 59d82c8..5c3dc0e 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -164,7 +164,7 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
             merge_csr_mv!(A, col, α, ABα_view, $t)
         end
         # doesn't seem to mutate properly
-        C += ABα
+        C[:,:] += ABα
         C
         # end of @eval macro
     end

From b6c26ccbefa85cc52f12c78028e2a6aa9a6f7e90 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 10:50:42 -0500
Subject: [PATCH 15/26] Add tests for transpose on square and rectangular
 matrices

---
 test/mul!.jl | 25 +++++++++++++++++++++----
 1 file changed, 21 insertions(+), 4 deletions(-)

diff --git a/test/mul!.jl b/test/mul!.jl
index 4520170..16e3683 100644
--- a/test/mul!.jl
+++ b/test/mul!.jl
@@ -20,14 +20,31 @@ using SparseArrays
 end
 
 # trigger merge_csr_mv!
-@testset "Transpose" begin
+@testset "Transpose Square" begin
 
     # C = transpose(A)Bα + Cβ
-    A = transpose(sprand(4, 4, 0.3))
-    B = rand(1:10, (4,4))
+    A = transpose(sparse(rand(1:5, 4, 4)))
+    B = rand(1:5, (4,4))
     α = 1.0
 
-    C = zeros(Float64, 4, 4)
+    C = zeros(4, 4)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    # right hand side is correct, left hand side is problematic
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular" begin
+
+    A = sparse([1 2 3 4; 5 6 7 8]) |> transpose
+    B = [1 2 3; 4 5 6]
+
+    α = 1.0
+
+    C = zeros(4, 3)
     C_copy = deepcopy(C)
     β = 1.0
 

From 182aaa53967e114480e2fbdab0a2a452eefae32c Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 11:41:11 -0500
Subject: [PATCH 16/26] Changed unit tests to match new argument ordering

---
 test/merge_csr_mv!.jl | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 241a258..b01c67f 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -6,6 +6,9 @@ using SparseArrays
 ##       considering that our redefinition of SparseArrays.mul! seems to
 ##       interfere
 
+## New function signature for merge_csr_mv!, it is now
+## merge_csr_mv!(α, A, input, output, op)
+## so should be A*input*α = output with some op
 @testset "Extreme Cases" begin
 
     @testset "Singleton" begin
@@ -15,7 +18,7 @@ using SparseArrays
 
         y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
+        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose)
 
         @test Matrix(A) * x == y
     end
@@ -28,7 +31,7 @@ using SparseArrays
 
         y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, adjoint)
+        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, adjoint)
 
         @test Matrix(A) * x ≈ y
     end
@@ -41,7 +44,7 @@ using SparseArrays
 
         y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(A, x, 1.0, y, transpose)
+        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose)
 
         @test Matrix(A) * x ≈ y
     end
@@ -55,7 +58,7 @@ end
 
     y = zeros(size(A, 1))
 
-    ParallelMergeCSR.merge_csr_mv!(A, x, 1.1, y, adjoint)
+    ParallelMergeCSR.merge_csr_mv!(1.1, A, x, y, adjoint)
 
     @test (Matrix(A) * x) * 1.1 ≈ y
 
@@ -78,7 +81,8 @@ end
     y = zeros(Int64, size(A, 1))
     
     # multiply
-    ParallelMergeCSR.merge_csr_mv!(A, x, 2.0, y, adjoint)
+    ParallelMergeCSR.merge_csr_mv!(2.0, A, x, y, adjoint)
+
 
     @test m * x * 2.0 == y
 end
@@ -93,7 +97,7 @@ end
     # create empty solution
     y = zeros(size(A, 1))
 
-    ParallelMergeCSR.merge_csr_mv!(A, x, 3, y, transpose)
+    ParallelMergeCSR.merge_csr_mv!(3, A, x, y, transpose)
 
     @test Matrix(A) * x * 3 ≈ y
 end
@@ -115,7 +119,8 @@ end
     for (idx, col) in enumerate(eachcol(X))
         # merge_csr_mv!(A, x, β, y, op)
         Y_view = @view Y[:, idx]
-        ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x)
+        ParallelMergeCSR.merge_csr_mv!(α, A, col, Y_view, x -> x)
+        # ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x)
     end
 
     @test Matrix(A) * X == Y 

From 35af5feef9797f6c8ff51faa9f70371a65dbbcda Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Tue, 3 Jan 2023 12:06:50 -0500
Subject: [PATCH 17/26] adding StridedVector type

---
 src/ParallelMergeCSR.jl | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 11645e8..17900c6 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -79,7 +79,7 @@ we do it just so the INTERNAL representation of the matrix looks like a CSR
 # 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix
 # StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place 
 # Axβ = y
-function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op)
+function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
 
     # transpose the CSC to CSR
     ## colptr in CSC equiv. to rowptr in CSR
@@ -116,7 +116,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op)
         thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
 
         # Consume merge items, whole rows first
-        running_total = 0       
+        running_total = zero(eltype(output))
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
                 @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
@@ -124,7 +124,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op)
             end
 
             @inbounds output[thread_coord.x + 1] += α * running_total
-            running_total = 0
+            running_total = zero(eltype(output))
             thread_coord.x += 1 
         end
        

From 8cd636bc4956ddb883386e78a974a93aec1ac07f Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 14:40:45 -0500
Subject: [PATCH 18/26] fixed segfault, revised some tests to adjoint/transpose
 matrix v. plain matrix

---
 src/ParallelMergeCSR.jl |  4 +++-
 test/merge_csr_mv!.jl   | 39 ++++++++++++++++++++++++---------------
 2 files changed, 27 insertions(+), 16 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 11645e8..0026f4c 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -94,7 +94,9 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input, output, op)
     cp = getcolptr(A)
 
     nnz = length(nzv) 
-    nrows = length(cp) - 1
+    
+    # nrows = length(cp) - 1 can give the wrong number of rows!
+    nrows = A.m
 
     nz_indices = rv
     row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index b01c67f..7ae8fc6 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -18,49 +18,58 @@ using SparseArrays
 
         y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose)
+        ParallelMergeCSR.merge_csr_mv!(0.3, A, x, y, transpose)
 
-        @test Matrix(A) * x == y
+        @test Matrix(A) * x * 0.3 == y
     end
 
     @testset "Single row" begin
 
+        # 10 x 1 converted to 1 x 10
         A = SparseArrays.sprand(10, 1, 0.3)
+        A_transpose = copy(transpose(A))
 
-        x = rand(1:10, 1)
+        # x needs to be 10 x 1 
+        x = rand(size(A, 1))
 
-        y = zeros(size(A, 1))
+        y = zeros(eltype(x), 1)
 
-        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, adjoint)
+        ParallelMergeCSR.merge_csr_mv!(1.1, A_transpose, x, y, transpose)
 
-        @test Matrix(A) * x ≈ y
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
     end
 
     @testset "Single column" begin
 
-        A = SparseArrays.sprand(10, 1, 0.3)
+        # 1 x 10 is now 10 x 1
+        A = SparseArrays.sprand(1, 10, 0.3)
+        A_adjoint = copy(adjoint(A))
 
-        x = rand(1:10, 1)
+        # needs to be 1 x 1
+        x = rand()
 
-        y = zeros(size(A, 1))
+        # 10 x 1
+        y = zeros(size(A_adjoint))
 
-        ParallelMergeCSR.merge_csr_mv!(1.0, A, x, y, transpose)
+        ParallelMergeCSR.merge_csr_mv!(0.7, A_adjoint, x, y, adjoint)
 
-        @test Matrix(A) * x ≈ y
+        @test adjoint(Matrix(A)) * x * 0.7 ≈ y
     end
 end
 
 @testset "Square" begin
     # 10 x 10 with 30% chance of entry being made
     A = SparseArrays.sprand(10,10,0.3)
+    A_adjoint = copy(adjoint(A))
 
+    # 10 x 1
     x = rand(10)
 
-    y = zeros(size(A, 1))
+    y = zeros(size(x))
 
-    ParallelMergeCSR.merge_csr_mv!(1.1, A, x, y, adjoint)
+    ParallelMergeCSR.merge_csr_mv!(1.1, A_adjoint, x, y, adjoint)
 
-    @test (Matrix(A) * x) * 1.1 ≈ y
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
 
 end
 
@@ -71,7 +80,7 @@ end
          0 0 50 60 70 0;
          0 0 0 0 0 80]
 
-    # get into CSR form
+    # get into CSC form
     A = sparse(m)
 
     # create vector

From 34eda6b3f482be1c577a87b6b0414235146480ad Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Tue, 3 Jan 2023 14:49:20 -0500
Subject: [PATCH 19/26] fixing imports

---
 src/ParallelMergeCSR.jl | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index a8871d8..f1a17ba 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -3,9 +3,7 @@ module ParallelMergeCSR
 using SparseArrays
 using SparseArrays: AbstractSparseMatrixCSC,
                     DenseInputVecOrMat,
-                    getcolptr,
-                    getrowval,
-                    getnzval
+                    getcolptr
 
 using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
 
@@ -89,8 +87,8 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
     # At = copy(transpose(A))
     # row_end_offsets = At.colptr[2:end]
 
-    nzv = getnzval(A)
-    rv = getrowval(A)
+    nzv = nonzeros(A)
+    rv = rowvals(A)
     cp = getcolptr(A)
 
     nnz = length(nzv) 

From 366d2e8e56dbf6ab8d048da1359c2f860036abd8 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Tue, 3 Jan 2023 15:44:41 -0500
Subject: [PATCH 20/26] rm inbounds no longer overloading SparseArray.mul!

Co-authored-by: John Long <johnzl@protonmail.ch>
---
 src/ParallelMergeCSR.jl | 12 +++++++-----
 test/merge_csr_mv!.jl   | 38 +++++++++++++++++++-------------------
 2 files changed, 26 insertions(+), 24 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index f1a17ba..9b7908b 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -94,7 +94,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
     nnz = length(nzv) 
     
     # nrows = length(cp) - 1 can give the wrong number of rows!
-    nrows = A.m
+    nrows = A.n
 
     nz_indices = rv
     row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
@@ -119,11 +119,13 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         running_total = zero(eltype(output))
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                # @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            @inbounds output[thread_coord.x + 1] += α * running_total
+            # @inbounds output[thread_coord.x + 1] += α * running_total
+            output[thread_coord.x + 1] += α * running_total
             running_total = zero(eltype(output))
             thread_coord.x += 1 
         end
@@ -154,7 +156,7 @@ end
 # C = transpose(A)B + Cβ
 # C = xABα + Cβ
 for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
-    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+    @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
         # obtains the original matrix underneath the "lazy wrapper"
         A = xA.parent
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
@@ -204,7 +206,7 @@ function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex
     end
 end
 
-function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+function mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
     size(A, 2) == size(B, 1) || throw(DimensionMismatch())
     size(A, 1) == size(C, 1) || throw(DimensionMismatch())
     size(B, 2) == size(C, 2) || throw(DimensionMismatch())
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 7ae8fc6..42117da 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -1,5 +1,5 @@
 using Test
-using ParallelMergeCSR
+using ParallelMergeCSR: merge_csr_mv!
 using SparseArrays
 
 ## NOTE: Sparse matrices are converted to dense form in the @test's
@@ -18,7 +18,7 @@ using SparseArrays
 
         y = zeros(size(A, 1))
 
-        ParallelMergeCSR.merge_csr_mv!(0.3, A, x, y, transpose)
+        merge_csr_mv!(0.3, A, x, y, transpose)
 
         @test Matrix(A) * x * 0.3 == y
     end
@@ -27,14 +27,13 @@ using SparseArrays
 
         # 10 x 1 converted to 1 x 10
         A = SparseArrays.sprand(10, 1, 0.3)
-        A_transpose = copy(transpose(A))
 
         # x needs to be 10 x 1 
         x = rand(size(A, 1))
 
         y = zeros(eltype(x), 1)
 
-        ParallelMergeCSR.merge_csr_mv!(1.1, A_transpose, x, y, transpose)
+        merge_csr_mv!(1.1, A, x, y, transpose)
 
         @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
     end
@@ -43,31 +42,31 @@ using SparseArrays
 
         # 1 x 10 is now 10 x 1
         A = SparseArrays.sprand(1, 10, 0.3)
-        A_adjoint = copy(adjoint(A))
-
+        
         # needs to be 1 x 1
-        x = rand()
+        x = rand(1)
 
         # 10 x 1
-        y = zeros(size(A_adjoint))
+        y = zeros(size(A,2))
+        
+        merge_csr_mv!(0.7, A, x, y, adjoint)
 
-        ParallelMergeCSR.merge_csr_mv!(0.7, A_adjoint, x, y, adjoint)
+        y_exact = 0.7 * adjoint(Matrix(A)) * x
 
-        @test adjoint(Matrix(A)) * x * 0.7 ≈ y
+        @test y_exact ≈ y
     end
 end
 
 @testset "Square" begin
     # 10 x 10 with 30% chance of entry being made
     A = SparseArrays.sprand(10,10,0.3)
-    A_adjoint = copy(adjoint(A))
 
     # 10 x 1
     x = rand(10)
 
     y = zeros(size(x))
 
-    ParallelMergeCSR.merge_csr_mv!(1.1, A_adjoint, x, y, adjoint)
+    merge_csr_mv!(1.1, A, x, y, adjoint)
 
     @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
 
@@ -84,16 +83,17 @@ end
     A = sparse(m)
 
     # create vector
-    x = [5,2,3,1,8,2]
+    # x = [5,2,3,1,8,2]
+    x = [5,2,3,1]
 
     # create empty solution
-    y = zeros(Int64, size(A, 1))
+    y = zeros(Int64, size(A, 2))
     
     # multiply
-    ParallelMergeCSR.merge_csr_mv!(2.0, A, x, y, adjoint)
+    merge_csr_mv!(2.0, A, x, y, adjoint)
 
 
-    @test m * x * 2.0 == y
+    @test transpose(m) * x * 2.0 == y
 end
 
 @testset "100x100" begin
@@ -106,7 +106,7 @@ end
     # create empty solution
     y = zeros(size(A, 1))
 
-    ParallelMergeCSR.merge_csr_mv!(3, A, x, y, transpose)
+    merge_csr_mv!(3, A, x, y, transpose)
 
     @test Matrix(A) * x * 3 ≈ y
 end
@@ -128,8 +128,8 @@ end
     for (idx, col) in enumerate(eachcol(X))
         # merge_csr_mv!(A, x, β, y, op)
         Y_view = @view Y[:, idx]
-        ParallelMergeCSR.merge_csr_mv!(α, A, col, Y_view, x -> x)
-        # ParallelMergeCSR.merge_csr_mv!(A, col, α, Y_view, x -> x)
+        merge_csr_mv!(α, A, col, Y_view, x -> x)
+        # merge_csr_mv!(A, col, α, Y_view, x -> x)
     end
 
     @test Matrix(A) * X == Y 

From 778d524cf8c6b7d90f68425a4f60331a2910d18b Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 17:48:35 -0500
Subject: [PATCH 21/26] Fixed issue with incorrect output and updated tests to
 always use transposed/adjointed matrices

---
 src/ParallelMergeCSR.jl | 11 +++--------
 test/merge_csr_mv!.jl   | 34 +++++++++++++++++-----------------
 2 files changed, 20 insertions(+), 25 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 9b7908b..416bd91 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -84,9 +84,6 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
     ## rowval in CSC equiv. to colval in CSR
     ## rows are now columns so the m x n dimensions after transpose are n x m
 
-    # At = copy(transpose(A))
-    # row_end_offsets = At.colptr[2:end]
-
     nzv = nonzeros(A)
     rv = rowvals(A)
     cp = getcolptr(A)
@@ -96,9 +93,9 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
     # nrows = length(cp) - 1 can give the wrong number of rows!
     nrows = A.n
 
-    nz_indices = rv
+    nz_indices = collect(1:nnz)
     row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
-    num_merge_items = length(nzv) + nrows # preserve the dimensions of the original matrix
+    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
 
     num_threads = nthreads()
     items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
@@ -119,12 +116,10 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         running_total = zero(eltype(output))
         while thread_coord.x < thread_coord_end.x
             while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                # @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            # @inbounds output[thread_coord.x + 1] += α * running_total
             output[thread_coord.x + 1] += α * running_total
             running_total = zero(eltype(output))
             thread_coord.x += 1 
@@ -133,7 +128,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] 
+            running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
             thread_coord.y += 1
         end
 
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 42117da..5fc5ec9 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -5,10 +5,6 @@ using SparseArrays
 ## NOTE: Sparse matrices are converted to dense form in the @test's
 ##       considering that our redefinition of SparseArrays.mul! seems to
 ##       interfere
-
-## New function signature for merge_csr_mv!, it is now
-## merge_csr_mv!(α, A, input, output, op)
-## so should be A*input*α = output with some op
 @testset "Extreme Cases" begin
 
     @testset "Singleton" begin
@@ -57,6 +53,7 @@ using SparseArrays
     end
 end
 
+# test fails, y seems to have lots of zero-entries
 @testset "Square" begin
     # 10 x 10 with 30% chance of entry being made
     A = SparseArrays.sprand(10,10,0.3)
@@ -64,6 +61,7 @@ end
     # 10 x 1
     x = rand(10)
 
+    # 10 x 1
     y = zeros(size(x))
 
     merge_csr_mv!(1.1, A, x, y, adjoint)
@@ -93,7 +91,7 @@ end
     merge_csr_mv!(2.0, A, x, y, adjoint)
 
 
-    @test transpose(m) * x * 2.0 == y
+    @test Matrix(adjoint(m)) * x * 2.0 == y
 end
 
 @testset "100x100" begin
@@ -101,37 +99,39 @@ end
     A = SparseArrays.sprand(100, 100, 0.3)
 
     # create vector
-    x = rand(1:100, 100, 1)
+    x = rand(100)
 
     # create empty solution
     y = zeros(size(A, 1))
 
-    merge_csr_mv!(3, A, x, y, transpose)
+    merge_csr_mv!(3.0, A, x, y, transpose)
 
-    @test Matrix(A) * x * 3 ≈ y
+    @test transpose(Matrix(A)) * x * 3 ≈ y
 end
 
 @testset "Matrix x Matrix" begin
 
     ## Calculate AXα
     # Create Matrices
-    A = SparseArrays.sparse(rand(1:10, 3, 2))
-    X = rand(1:10, 2, 4)
+    # 3 x 2
+    A = sprand(3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
 
     # set alpha to 1.0 for now
-    α = 1.0
+    α = 1.0 
 
-    # Create place to store solution
-    Y = zeros(3, 4)
+    # Create place to store solution, 
+    # should be 2 x 4
+    Y = zeros(2, 4)
 
     # iterate
+    # some kind of indexing issue going on here
     for (idx, col) in enumerate(eachcol(X))
-        # merge_csr_mv!(A, x, β, y, op)
         Y_view = @view Y[:, idx]
-        merge_csr_mv!(α, A, col, Y_view, x -> x)
-        # merge_csr_mv!(A, col, α, Y_view, x -> x)
+        merge_csr_mv!(α, A, col, Y_view, transpose)
     end
 
-    @test Matrix(A) * X == Y 
+    @test transpose(Matrix(A)) * X * 1.0 ≈ Y 
 
 end
\ No newline at end of file

From 938ecd76162696f50fcbb3ebc10d7a3da75dec6e Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Tue, 3 Jan 2023 18:12:54 -0500
Subject: [PATCH 22/26] Add @inbounds

---
 src/ParallelMergeCSR.jl | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 416bd91..06c9ef6 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -115,12 +115,12 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         # Consume merge items, whole rows first
         running_total = zero(eltype(output))
         while thread_coord.x < thread_coord_end.x
-            while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
+                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
-            output[thread_coord.x + 1] += α * running_total
+            @inbounds output[thread_coord.x + 1] += α * running_total
             running_total = zero(eltype(output))
             thread_coord.x += 1 
         end
@@ -128,13 +128,13 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         # May have thread end up partially consuming a row.
         # Save result form partial consumption and do one pass at the end to add it back to y
         while thread_coord.y < thread_coord_end.y
-            running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
             thread_coord.y += 1
         end
 
         # Save carry-outs
-        row_carry_out[tid] = thread_coord_end.x + 1
-        value_carry_out[tid] = running_total
+        @inbounds row_carry_out[tid] = thread_coord_end.x + 1
+        @inbounds value_carry_out[tid] = running_total
 
     end
 

From c9b8cccc72dd8a47374ee6f96d70910daeef0145 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Wed, 4 Jan 2023 10:24:29 -0500
Subject: [PATCH 23/26] Removed unnecessary comments, added SparseArrays
 definition override

---
 src/ParallelMergeCSR.jl | 54 ++++-------------------------------------
 1 file changed, 5 insertions(+), 49 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 06c9ef6..5e777bb 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -45,45 +45,10 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
 
 end
 
-#=
-Algorithm originally designed for CSR now needs to work for CSC
-
-CSC has fields:
-* m - number of rows
-* n - number of columns
-* colptr::Vector - column j is in colptr[j]:(colptr[j+1]-1)
-* rowval::Vector - row indices of stored values
-* nzval::Vector  - stored values, typically nonzeros
-
-CSR used to have fields: 
-* m - number of rows
-* n - number of columns
-* rowptr::Vector - row i is in rowptr[i]:(rowptr[i+1]-1)
-* colval::Vector - col indices of stored values
-* nzval::Vector - Stored values, typically non-zeros
-
-Given a matrix A in CSC, if you do transpose(A), indexing
-individual elements is fine but it's a lazy transpose.
-To materialize the changes, probably requires you perform some copy operation
-e.g. can see changes via dump(copy(transpose(m))) where m isa AbstractMatrixCSC
-
-* NOTE: we don't want the transpose to actually change the matrix dimensions,
-we do it just so the INTERNAL representation of the matrix looks like a CSR
-=#
-
-
-# TODO: 
-# 1. add @inbounds to remove bounds check
-# 2. add update tests to work with CSC matrices. you can use sprand(...) to generate some matrix
-# StridedVector is too restrictive, not even sure how you end up with a StridedVector in the first place 
-# Axβ = y
+# Internally treats the matrix as if its been transposed/is an adjoint
+# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
 function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
 
-    # transpose the CSC to CSR
-    ## colptr in CSC equiv. to rowptr in CSR
-    ## rowval in CSC equiv. to colval in CSR
-    ## rows are now columns so the m x n dimensions after transpose are n x m
-
     nzv = nonzeros(A)
     rv = rowvals(A)
     cp = getcolptr(A)
@@ -151,21 +116,17 @@ end
 # C = transpose(A)B + Cβ
 # C = xABα + Cβ
 for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
-    @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
         # obtains the original matrix underneath the "lazy wrapper"
         A = xA.parent
         size(A, 2) == size(C, 1) || throw(DimensionMismatch())
         size(A, 1) == size(B, 1) || throw(DimensionMismatch())
         size(B, 2) == size(C, 2) || throw(DimensionMismatch())
         if β != 1
-            # assume the rmul! intended to come from SparseArrays
-            ## (could implement a threaded version of that as well, maybe piggyback off of impl branch)
             # preemptively handle β so we just handle C = ABα + C
             β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
         end
-        # move multiplication by alpha into the multithreaded part
         for (col_idx, input) in enumerate(eachcol(B))
-            # merge_csr_mv!(A, x, β, y, op)
             output = @view C[:, col_idx]
             merge_csr_mv!(α, A, input, output, $t)
         end
@@ -201,25 +162,20 @@ function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex
     end
 end
 
-function mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
     size(A, 2) == size(B, 1) || throw(DimensionMismatch())
     size(A, 1) == size(C, 1) || throw(DimensionMismatch())
     size(B, 2) == size(C, 2) || throw(DimensionMismatch())
     nzv = nonzeros(A)
     rv = rowvals(A)
     if β != 1
-        # assume the rmul! intended to come from SparseArrays
-        ## (could implement a threaded version of that as well, maybe piggyback off of impl branch)
         # preemptively handle β so we just handle C = ABα + C
         β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
     end
-    # iterate over the columns of C
     for k in 1:size(C, 2)
-        # iterate over columns of A
         @threads for col in 1:size(A, 2)
-            # multiply each element of B times alpha (single value)
             @inbounds αxj = B[col,k] * α
-            for j in nzrange(A, col) # range of indices in nzv of A restricted to column
+            for j in nzrange(A, col)
                 @inbounds val = nzv[j]*αxj
                 @inbounds row = rv[j]
                 @inbounds out = @view C[row:row, k]

From 5217a3cdf43aa5dd5c088f45838c54b578cfea95 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Wed, 4 Jan 2023 12:33:55 -0500
Subject: [PATCH 24/26] Reordered imports to alphabetical order

---
 src/ParallelMergeCSR.jl | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 5e777bb..1a74996 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -1,16 +1,13 @@
 module ParallelMergeCSR
 
+using Atomix
+using Base.Threads
 using SparseArrays
+using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
 using SparseArrays: AbstractSparseMatrixCSC,
                     DenseInputVecOrMat,
                     getcolptr
 
-using LinearAlgebra: Adjoint,adjoint,Transpose,transpose
-
-using Base.Threads
-
-using Atomix
-
 mutable struct Coordinate
     x::Int
     y::Int

From b033e3dcbc9f57cda21b6d82c4e0e18e407097e5 Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Wed, 4 Jan 2023 12:34:39 -0500
Subject: [PATCH 25/26] added complex-valued unit tests for merge_csr_mv,
 removed incompatible tests

---
 test/merge_csr_mv!.jl | 152 +++++++++++++++++++++++++++++++++---------
 1 file changed, 120 insertions(+), 32 deletions(-)

diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 5fc5ec9..0eda52c 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -7,7 +7,7 @@ using SparseArrays
 ##       interfere
 @testset "Extreme Cases" begin
 
-    @testset "Singleton" begin
+    @testset "Singleton (Real)" begin
         A = sparse(reshape([1], 1, 1))
 
         x = rand(1)
@@ -19,43 +19,63 @@ using SparseArrays
         @test Matrix(A) * x * 0.3 == y
     end
 
-    @testset "Single row" begin
+    @testset "Singleton (Complex)" begin
+        A = sparse(reshape([1.0+2.5im], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(eltype(A), size(A, 1))
+
+        merge_csr_mv!(10.1, A, x, y, adjoint)
+
+        @test adjoint(Matrix(A)) * x * 10.1 == y
+    end
+
+    @testset "Single row (Real)" begin
 
         # 10 x 1 converted to 1 x 10
-        A = SparseArrays.sprand(10, 1, 0.3)
+        A = 10.0 * SparseArrays.sprand(10, 1, 0.3)
 
         # x needs to be 10 x 1 
         x = rand(size(A, 1))
 
-        y = zeros(eltype(x), 1)
+        y = zeros(eltype(A), 1)
 
         merge_csr_mv!(1.1, A, x, y, transpose)
 
         @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
     end
 
-    @testset "Single column" begin
+    @testset "Single row (Complex)" begin
 
-        # 1 x 10 is now 10 x 1
-        A = SparseArrays.sprand(1, 10, 0.3)
-        
-        # needs to be 1 x 1
-        x = rand(1)
+        # 10 x 1 is treated as 1 x 10 inside merge_csr_mv!
+        A = SparseArrays.sprand(Complex{Float64}, 10, 1, 0.3)
 
-        # 10 x 1
-        y = zeros(size(A,2))
-        
-        merge_csr_mv!(0.7, A, x, y, adjoint)
+        x = rand(eltype(A), size(A, 1))
 
-        y_exact = 0.7 * adjoint(Matrix(A)) * x
+        y = zeros(eltype(A), 1)
 
-        @test y_exact ≈ y
+        merge_csr_mv!(1.1, A, x, y, transpose)
+
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
     end
+
+
+    #= 
+        Single Column test wouldn't work because 
+        merge_csr_mv! is designed to handle Ax = y where
+        A is m x n, x is n x 1, and y must be m x 1.
+
+        Given a single column A that is 1 x z, merge_csr_mv! treats it as 
+        z x 1, which means y is constrained to 1 x 1 and the correct
+        output would be z x 1, which merge_csr_mv! is not designed for. 
+
+        On the other hand, the overridden SparseArrays.mul! should work
+    =#
 end
 
 # test fails, y seems to have lots of zero-entries
-@testset "Square" begin
-    # 10 x 10 with 30% chance of entry being made
+@testset "Square (Real)" begin
     A = SparseArrays.sprand(10,10,0.3)
 
     # 10 x 1
@@ -70,8 +90,20 @@ end
 
 end
 
-@testset "4x6" begin
-    # create matrix
+@testset "Square (Complex)" begin
+    A = SparseArrays.sprand(Complex{Float64}, 10, 10, 0.3)
+
+    x = 10 * rand(Complex{Float64}, 10)
+
+    y = zeros(eltype(A), size(x))
+
+    merge_csr_mv!(1.1, A, x, y, adjoint)
+
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
+end
+
+@testset "4x6 (Real)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
     m = [10 20 0 0 0 0;
          0 30 0 40 0 0;
          0 0 50 60 70 0;
@@ -81,7 +113,6 @@ end
     A = sparse(m)
 
     # create vector
-    # x = [5,2,3,1,8,2]
     x = [5,2,3,1]
 
     # create empty solution
@@ -94,7 +125,30 @@ end
     @test Matrix(adjoint(m)) * x * 2.0 == y
 end
 
-@testset "100x100" begin
+@testset "4 x 6 (Complex)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
+    m = [10 20 0 0 0 0;
+    0 30 0 40 0 0;
+    0 0 50 60 70 0;
+    0 0 0 0 0 80]
+
+    # get into CSC form
+    A = sparse(m)
+
+    # create vector
+    x = 22.1 * rand(Complex{Float64}, 4)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 2))
+
+    # multiply
+    merge_csr_mv!(2.0, A, x, y, adjoint)
+
+    @test Matrix(adjoint(m)) * x * 2.0 == y
+
+end
+
+@testset "100x100 (Real)" begin
     # create matrix
     A = SparseArrays.sprand(100, 100, 0.3)
 
@@ -109,29 +163,63 @@ end
     @test transpose(Matrix(A)) * x * 3 ≈ y
 end
 
-@testset "Matrix x Matrix" begin
+@testset "100x100 (Complex)" begin
+    # create matrix
+    A = SparseArrays.sprand(Complex{Float64}, 100, 100, 0.3)
+
+    # create vector
+    x = rand(Complex{Float64}, 100)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 1))
+
+    merge_csr_mv!(3.0, A, x, y, transpose)
+
+    @test transpose(Matrix(A)) * x * 3 ≈ y
+end
+
+#=
+    NOTE: While merge_csr_mv! can be used this way, the overriden SparseArrays.mul!
+    should be the preferred method to do Matrix-Matrix multiplication.
+=#
+@testset "Matrix-Matrix (Real)" begin
 
-    ## Calculate AXα
     # Create Matrices
-    # 3 x 2
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
     A = sprand(3, 2, 0.4)
     # 3 x 4
     X = rand(3, 4)
 
-    # set alpha to 1.0 for now
-    α = 1.0 
+    α = 9.2
 
-    # Create place to store solution, 
-    # should be 2 x 4
     Y = zeros(2, 4)
 
-    # iterate
-    # some kind of indexing issue going on here
     for (idx, col) in enumerate(eachcol(X))
         Y_view = @view Y[:, idx]
         merge_csr_mv!(α, A, col, Y_view, transpose)
     end
 
-    @test transpose(Matrix(A)) * X * 1.0 ≈ Y 
+    @test transpose(Matrix(A)) * X * 9.2 ≈ Y 
+
+end
+
+@testset "Matrix-Matrix (Real)" begin
+
+    # Create Matrices
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
+    A = sprand(Complex{Float64}, 3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
+
+    α = 9.2
+
+    Y = zeros(eltype(A), 2, 4)
+
+    for (idx, col) in enumerate(eachcol(X))
+        Y_view = @view Y[:, idx]
+        merge_csr_mv!(α, A, col, Y_view, adjoint)
+    end
+
+    @test adjoint(Matrix(A)) * X * 9.2 ≈ Y 
 
 end
\ No newline at end of file

From 87852a68e2dec0f993aff1c0944ebe223eb288cb Mon Sep 17 00:00:00 2001
From: John Long <johnzl@protonmail.ch>
Date: Wed, 4 Jan 2023 14:34:29 -0500
Subject: [PATCH 26/26] Added test for complex and real-valued
 matrices/vectors/coefficients in mul

---
 test/mul!.jl | 127 +++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 112 insertions(+), 15 deletions(-)

diff --git a/test/mul!.jl b/test/mul!.jl
index 16e3683..5b22b6a 100644
--- a/test/mul!.jl
+++ b/test/mul!.jl
@@ -2,8 +2,25 @@ using Test
 using ParallelMergeCSR
 using SparseArrays
 
-# trigger merge_csr_mv!
-@testset "Adjoint" begin
+# trigger merge_csr_mv! in this repo, does not default to mul! somewhere else
+@testset "Adjoint Real" begin
+
+    A = adjoint(sprand(5, 5, 0.3))    
+
+    B = rand(5, 5)
+    α = 11.2
+
+    C = rand(5, 5)
+    C_copy = deepcopy(C)
+    β = 3.9
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+
+end
+
+@testset "Adjoint Complex" begin
     
     # C = adjoint(A)Bα + Cβ
     A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3))
@@ -17,10 +34,11 @@ using SparseArrays
     SparseArrays.mul!(C, A, B, α, β)
 
     @test C ≈ Matrix(A) * B * α + C_copy * β
-end
 
+end
 # trigger merge_csr_mv!
-@testset "Transpose Square" begin
+
+@testset "Transpose Square Real" begin
 
     # C = transpose(A)Bα + Cβ
     A = transpose(sparse(rand(1:5, 4, 4)))
@@ -35,29 +53,62 @@ end
 
     # right hand side is correct, left hand side is problematic
     @test C ≈ Matrix(A) * B * α + C_copy * β
-end
 
-@testset "Transpose Rectangular" begin
+end
 
-    A = sparse([1 2 3 4; 5 6 7 8]) |> transpose
-    B = [1 2 3; 4 5 6]
 
+@testset "Transpose Square Complex" begin
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sparse(rand(Complex{Float64}, 4, 4)))
+    B = rand(4,4)
     α = 1.0
 
-    C = zeros(4, 3)
+    C = zeros(eltype(A), 4, 4)
+    C_copy = deepcopy(C)
+    β = 6.11 + 9.2im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular Real" begin
+
+    A = sparse(rand(4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 9.1
+
+    C = zeros(eltype(A), 2, 3)
     C_copy = deepcopy(C)
     β = 1.0
 
     SparseArrays.mul!(C, A, B, α, β)
 
     @test C ≈ Matrix(A) * B * α + C_copy * β
+
 end
 
-@testset "Matrix x Matrix" begin
+@testset "Transpose Rectangular Complex" begin
+
+    A = sparse(rand(Complex{Float64}, 4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 0.0 + 5.3im
+
+    C = zeros(eltype(A), 2, 3)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+
+end
 
-    # C = ABα + Cβ
-    ## A should be CSC matrix, B should be Dense
-    ## α, β are `Number`s
+@testset "Matrix x Matrix (Real)" begin
+
+    # A should be sparse, B should be dense
     A = sprand(5, 5, 0.2)
     B = rand(5, 5)
     α = 1.1
@@ -71,7 +122,23 @@ end
 
 end
 
-@testset "Matrix x Vector" begin
+@testset "Matrix x Matrix (Complex)" begin
+
+    # A should be sparse, B should be dense
+    A = sprand(5, 5, 0.2)
+    B = rand(Complex{Float64}, 5, 5)
+    α = 1.1
+
+    C = zeros(eltype(B), 5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+
+end
+
+@testset "Matrix x Vector (Real)" begin
 
     A = sprand(5, 5, 0.3)
     B = rand(5)
@@ -84,6 +151,36 @@ end
     SparseArrays.mul!(C, A, B, α, β)
 
     @test C ≈ Matrix(A) * B * α + C_copy * β
-    
+end
+
+@testset "Matrix x Vector (Real)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(5)
+    α = 2.5
+
+    C = zeros(Float64, size(B))
+    C_copy = deepcopy(C)
+    β = 5.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+
+end
+
+@testset "Matrix x Vector (Complex)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(Complex{Float64}, 5)
+    α = 2.5
+
+    C = zeros(eltype(B), size(B))
+    C_copy = deepcopy(C)
+    β = 2.1+0.1im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
 
 end
\ No newline at end of file