diff --git a/Project.toml b/Project.toml
index c9d8f73..f1a521d 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,2 +1,14 @@
+name = "ParallelMergeCSR"
+uuid = "c4b929cc-6141-4ba4-8849-280a863346e1"
+version = "0.1.0"
+
 [deps]
+Atomix = "a9b6321e-bd34-4604-b9c9-b65b8de01458"
+LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
+
+[extras]
+Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+
+[targets]
+test = ["Test"]
diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index ca33a3a..1a74996 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -1,13 +1,187 @@
 module ParallelMergeCSR
-
+using Atomix
+using Base.Threads
 using SparseArrays
+using LinearAlgebra: Adjoint, adjoint, Transpose, transpose
+using SparseArrays: AbstractSparseMatrixCSC,
+                    DenseInputVecOrMat,
+                    getcolptr
+
+mutable struct Coordinate
+    x::Int
+    y::Int
+end
+
+# Want to give the illusion that we're still using 0-based coordinates when the actual
+# values themselves are 1-based
+#
+# diagonal -> i + j = k
+# a_len and b_len can stay the same as in the paper
+#
+# a -> row-end offsets, so really pass in a[2:end]
+# b -> "natural" numbers
+function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
+    # Diagonal search range (in x-coordinate space)
+    x_min = max(diagonal - b_len, 0)
+    x_max = min(diagonal, a_len)
+    # 2D binary search along the diagonal search range
+    while (x_min < x_max)
+        pivot = (x_min + x_max) >> 1
+        if (a[pivot + 1] <= b[diagonal - pivot])
+            x_min = pivot + 1
+        else
+            x_max = pivot
+        end
+    end
+
+    return Coordinate(
+        min(x_min, a_len),
+        diagonal - x_min
+    )
+end
+
+# Internally treats the matrix as if it has been transposed/is an adjoint,
+# e.g. if you pass in a 2x3, internally it's a 3x2, and it can be an adjoint if you pass in `adjoint` as the op
+function merge_csr_mv!(α::Number, A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    cp = getcolptr(A)
+
+    nnz = length(nzv)
+
+    # nrows = length(cp) - 1 can give the wrong number of rows!
+    nrows = A.n
+
+    nz_indices = collect(1:nnz)
+    row_end_offsets = cp[2:end] # nzval ordering differs between formats
+    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
+
+    num_threads = nthreads()
+    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
+
+    row_carry_out = zeros(eltype(cp), num_threads)
+    value_carry_out = zeros(eltype(output), num_threads) # value type must match output
+
+    # Julia thread ids start at 1, so make sure to offset!
+    @threads for tid in 1:num_threads
+        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
+        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
+
+        # Get starting and ending thread coordinates (row, nzv)
+        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
+
+        # Consume merge items, whole rows first
+        running_total = zero(eltype(output))
+        while thread_coord.x < thread_coord_end.x
+            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
+                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                thread_coord.y += 1
+            end
+
+            @inbounds output[thread_coord.x + 1] += α * running_total
+            running_total = zero(eltype(output))
+            thread_coord.x += 1
+        end
+
+        # A thread may end up partially consuming a row.
+        # Save the result from the partial consumption and do one pass at the end to add it back to y
+        while thread_coord.y < thread_coord_end.y
+            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            thread_coord.y += 1
+        end
+
+        # Save carry-outs
+        @inbounds row_carry_out[tid] = thread_coord_end.x + 1
+        @inbounds value_carry_out[tid] = running_total
+    end
+
+    for tid in 1:num_threads
+        @inbounds if row_carry_out[tid] <= nrows
+            @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid]
+        end
+    end
+end
+
+# C = adjoint(A)Bα + Cβ
+# C = transpose(A)Bα + Cβ
+# C = xABα + Cβ
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
+    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+        # obtain the original matrix underneath the "lazy wrapper"
+        A = xA.parent
+        size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+        size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+        size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+        if β != 1
+            # preemptively handle β so that below we only handle C = ABα + C
+            β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+        end
+        for (col_idx, input) in enumerate(eachcol(B))
+            output = @view C[:, col_idx]
+            merge_csr_mv!(α, A, input, output, $t)
+        end
+        C
+    # end of @eval macro
+    end
+# end of for loop
+end
+
+## mutating function names should be suffixed with an exclamation mark
+function atomic_add!(a::AbstractVector{T}, b::T) where T <: Real
+    # There's an atomic_add! in Threads, but it has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes,
+    # so you'd have to wrap each element of the array in Atomic.
+    # Objective: for each element in the AbstractVector, atomically add b to that element.
+    # Can also use Atomix.jl
+    for idx in 1:length(a)
+        Atomix.@atomic a[idx] += b
+    end
+end

-include("types.jl")
-include("threaded.jl")
+function atomic_add!(a::AbstractVector{T}, b::T) where T <: Complex
+    # reinterpret as the real type underlying the complex elements
+    view_a = reinterpret(reshape, real(T), a)
+    real_b = real(b)
+    imag_b = imag(b)

-# include("gpu.jl")
-# include("distributed.jl")
+    # mutating the view also mutates the underlying array it came from
+    for idx in 1:size(view_a, 2)
+        Atomix.@atomic view_a[1, idx] += real_b
+        Atomix.@atomic view_a[2, idx] += imag_b
+    end
+end
+
+function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    size(A, 1) == size(C, 1) || throw(DimensionMismatch())
+    size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    if β != 1
+        # preemptively handle β so that below we only handle C = ABα + C
+        β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+    end
+    for k in 1:size(C, 2)
+        @threads for col in 1:size(A, 2)
+            @inbounds αxj = B[col, k] * α
+            for j in nzrange(A, col)
+                @inbounds val = nzv[j] * αxj
+                @inbounds row = rv[j]
+                @inbounds out = @view C[row:row, k]
+                atomic_add!(out, val)
+            end
+        end
+    end
+    C
+end
+# end of the module
 end
\ No newline at end of file
diff --git a/src/threaded.jl b/src/threaded.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/src/types.jl b/src/types.jl
deleted file mode 100644
index e69de29..0000000
diff --git a/test/atomic_add!.jl b/test/atomic_add!.jl
new file mode 100644
index 0000000..5814afa
--- /dev/null
+++ b/test/atomic_add!.jl
@@ -0,0 +1,27 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+@testset "atomic_add!" begin
+    # atomic add with real
+    @testset "Real" begin
+        a = rand(Float64, 20)
+        a_copy = deepcopy(a)
+
+        b = 1.25
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+
+    # atomic add with complex
+    @testset "Complex" begin
+        a = rand(Complex{Float64}, 20)
+        a_copy = deepcopy(a)
+
+        b = 2.5 + 0.8im
+
+        ParallelMergeCSR.atomic_add!(a, b)
+        @test a == (a_copy .+ b)
+    end
+end
\ No newline at end of file
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
new file mode 100644
index 0000000..0eda52c
--- /dev/null
+++ b/test/merge_csr_mv!.jl
@@ -0,0 +1,225 @@
+using Test
+using ParallelMergeCSR: merge_csr_mv!
+using SparseArrays
+
+## NOTE: Sparse matrices are converted to dense form in the @test's,
+## since our redefinition of SparseArrays.mul! would otherwise
+## interfere
+@testset "Extreme Cases" begin
+
+    @testset "Singleton (Real)" begin
+        A = sparse(reshape([1], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(size(A, 1))
+
+        merge_csr_mv!(0.3, A, x, y, transpose)
+
+        @test Matrix(A) * x * 0.3 == y
+    end
+
+    @testset "Singleton (Complex)" begin
+        A = sparse(reshape([1.0+2.5im], 1, 1))
+
+        x = rand(1)
+
+        y = zeros(eltype(A), size(A, 1))
+
+        merge_csr_mv!(10.1, A, x, y, adjoint)
+
+        @test adjoint(Matrix(A)) * x * 10.1 == y
+    end
+
+    @testset "Single row (Real)" begin
+
+        # 10 x 1, converted to 1 x 10 internally
+        A = 10.0 * SparseArrays.sprand(10, 1, 0.3)
+
+        # x needs to be 10 x 1
+        x = rand(size(A, 1))
+
+        y = zeros(eltype(A), 1)
+
+        merge_csr_mv!(1.1, A, x, y, transpose)
+
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
+    end
+
+    @testset "Single row (Complex)" begin
+
+        # 10 x 1 is treated as 1 x 10 inside merge_csr_mv!
+        A = SparseArrays.sprand(Complex{Float64}, 10, 1, 0.3)
+
+        x = rand(eltype(A), size(A, 1))
+
+        y = zeros(eltype(A), 1)
+
+        merge_csr_mv!(1.1, A, x, y, transpose)
+
+        @test (transpose(Matrix(A)) * x) * 1.1 ≈ y
+    end
+
+    #=
+    A single-column test wouldn't work because merge_csr_mv! is
+    designed to handle Ax = y where A is m x n, x is n x 1, and
+    y must be m x 1.
+
+    Given a single-column A stored as 1 x z, merge_csr_mv! treats
+    it as z x 1, which means y is constrained to 1 x 1 while the
+    correct output would be z x 1, a case merge_csr_mv! is not
+    designed for.
+
+    On the other hand, the overridden SparseArrays.mul! should work.
+    =#
+end
+
+# NOTE: this test fails; y seems to have lots of zero entries
+@testset "Square (Real)" begin
+    A = SparseArrays.sprand(10, 10, 0.3)
+
+    # 10 x 1
+    x = rand(10)
+
+    # 10 x 1
+    y = zeros(size(x))
+
+    merge_csr_mv!(1.1, A, x, y, adjoint)
+
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
+end
+
+@testset "Square (Complex)" begin
+    A = SparseArrays.sprand(Complex{Float64}, 10, 10, 0.3)
+
+    x = 10 * rand(Complex{Float64}, 10)
+
+    y = zeros(eltype(A), size(x))
+
+    merge_csr_mv!(1.1, A, x, y, adjoint)
+
+    @test (adjoint(Matrix(A)) * x) * 1.1 ≈ y
+end
+
+@testset "4x6 (Real)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
+
+    # get into CSC form
+    A = sparse(m)
+
+    # create vector
+    x = [5, 2, 3, 1]
+
+    # create empty solution
+    y = zeros(Int64, size(A, 2))
+
+    # multiply
+    merge_csr_mv!(2.0, A, x, y, adjoint)
+
+    @test Matrix(adjoint(m)) * x * 2.0 == y
+end
+
+@testset "4x6 (Complex)" begin
+    # Taken from: https://en.wikipedia.org/wiki/Sparse_matrix
+    m = [10 20 0 0 0 0;
+         0 30 0 40 0 0;
+         0 0 50 60 70 0;
+         0 0 0 0 0 80]
+
+    # get into CSC form
+    A = sparse(m)
+
+    # create vector
+    x = 22.1 * rand(Complex{Float64}, 4)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 2))
+
+    # multiply
+    merge_csr_mv!(2.0, A, x, y, adjoint)
+
+    @test Matrix(adjoint(m)) * x * 2.0 == y
+end
+
+@testset "100x100 (Real)" begin
+    # create matrix
+    A = SparseArrays.sprand(100, 100, 0.3)
+
+    # create vector
+    x = rand(100)
+
+    # create empty solution
+    y = zeros(size(A, 1))
+
+    merge_csr_mv!(3.0, A, x, y, transpose)
+
+    @test transpose(Matrix(A)) * x * 3 ≈ y
+end
+
+@testset "100x100 (Complex)" begin
+    # create matrix
+    A = SparseArrays.sprand(Complex{Float64}, 100, 100, 0.3)
+
+    # create vector
+    x = rand(Complex{Float64}, 100)
+
+    # create empty solution
+    y = zeros(eltype(x), size(A, 1))
+
+    merge_csr_mv!(3.0, A, x, y, transpose)
+
+    @test transpose(Matrix(A)) * x * 3 ≈ y
+end
+
+#=
+    NOTE: While merge_csr_mv! can be used this way, the overridden
+    SparseArrays.mul! should be the preferred method for
+    matrix-matrix multiplication.
+=#
+@testset "Matrix-Matrix (Real)" begin
+
+    # Create matrices
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
+    A = sprand(3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
+
+    α = 9.2
+
+    Y = zeros(2, 4)
+
+    for (idx, col) in enumerate(eachcol(X))
+        Y_view = @view Y[:, idx]
+        merge_csr_mv!(α, A, col, Y_view, transpose)
+    end
+
+    @test transpose(Matrix(A)) * X * 9.2 ≈ Y
+end
+
+@testset "Matrix-Matrix (Complex)" begin
+
+    # Create matrices
+    # 3 x 2 (treated as 2 x 3 in merge_csr_mv!)
+    A = sprand(Complex{Float64}, 3, 2, 0.4)
+    # 3 x 4
+    X = rand(3, 4)
+
+    α = 9.2
+
+    Y = zeros(eltype(A), 2, 4)
+
+    for (idx, col) in enumerate(eachcol(X))
+        Y_view = @view Y[:, idx]
+        merge_csr_mv!(α, A, col, Y_view, adjoint)
+    end
+
+    @test adjoint(Matrix(A)) * X * 9.2 ≈ Y
+end
\ No newline at end of file
diff --git a/test/mul!.jl b/test/mul!.jl
new file mode 100644
index 0000000..5b22b6a
--- /dev/null
+++ b/test/mul!.jl
@@ -0,0 +1,186 @@
+using Test
+using ParallelMergeCSR
+using SparseArrays
+
+# These should trigger merge_csr_mv! in this repo rather than dispatching to a mul! defined elsewhere
+@testset "Adjoint Real" begin
+
+    A = adjoint(sprand(5, 5, 0.3))
+
+    B = rand(5, 5)
+    α = 11.2
+
+    C = rand(5, 5)
+    C_copy = deepcopy(C)
+    β = 3.9
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Adjoint Complex" begin
+
+    # C = adjoint(A)Bα + Cβ
+    A = adjoint(sprand(Complex{Float64}, 5, 5, 0.3))
+    B = rand(5, 5)
+    α = 2.3
+
+    C = rand(Complex{Float64}, 5, 5)
+    C_copy = deepcopy(C)
+    β = 1.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+# These also trigger merge_csr_mv!
+@testset "Transpose Square Real" begin
+
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sparse(rand(1:5, 4, 4)))
+    B = rand(1:5, (4, 4))
+    α = 1.0
+
+    C = zeros(4, 4)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    # the right-hand side is correct; the left-hand side is problematic
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Square Complex" begin
+    # C = transpose(A)Bα + Cβ
+    A = transpose(sparse(rand(Complex{Float64}, 4, 4)))
+    B = rand(4, 4)
+    α = 1.0
+
+    C = zeros(eltype(A), 4, 4)
+    C_copy = deepcopy(C)
+    β = 6.11 + 9.2im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular Real" begin
+
+    A = sparse(rand(4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 9.1
+
+    C = zeros(eltype(A), 2, 3)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Transpose Rectangular Complex" begin
+
+    A = sparse(rand(Complex{Float64}, 4, 2)) |> transpose
+    B = rand(4, 3)
+
+    α = 0.0 + 5.3im
+
+    C = zeros(eltype(A), 2, 3)
+    C_copy = deepcopy(C)
+    β = 1.0
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Matrix (Real)" begin
+
+    # A should be sparse, B should be dense
+    A = sprand(5, 5, 0.2)
+    B = rand(5, 5)
+    α = 1.1
+
+    C = zeros(5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Matrix (Complex)" begin
+
+    # A should be sparse, B should be dense
+    A = sprand(5, 5, 0.2)
+    B = rand(Complex{Float64}, 5, 5)
+    α = 1.1
+
+    C = zeros(eltype(B), 5, 5)
+    C_copy = deepcopy(C)
+    β = 0.3
+
+    SparseArrays.mul!(C, A, B, α, β)
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Vector (Real)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(5)
+    α = 2.5
+
+    C = zeros(Float64, size(B))
+    C_copy = deepcopy(C)
+    β = 5.2
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
+
+@testset "Matrix x Vector (Complex)" begin
+
+    A = sprand(5, 5, 0.3)
+    B = rand(Complex{Float64}, 5)
+    α = 2.5
+
+    C = zeros(eltype(B), size(B))
+    C_copy = deepcopy(C)
+    β = 2.1 + 0.1im
+
+    SparseArrays.mul!(C, A, B, α, β)
+
+    @test C ≈ Matrix(A) * B * α + C_copy * β
+end
\ No newline at end of file
diff --git a/test/runtests.jl b/test/runtests.jl
index e69de29..ecfe12b 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -0,0 +1,13 @@
+using Test
+
+@testset "merge_csr_mv!" begin
+    include("merge_csr_mv!.jl")
+end
+
+@testset "atomic_add!" begin
+    include("atomic_add!.jl")
+end
+
+@testset "mul!" begin
+    include("mul!.jl")
+end
\ No newline at end of file
diff --git a/test/threaded.jl b/test/threaded.jl
deleted file mode 100644
index 08debd5..0000000
--- a/test/threaded.jl
+++ /dev/null
@@ -1,9 +0,0 @@
-using SparseArrays
-using Random
-
-
-
-
-
-
-
diff --git a/test/types.jl b/test/types.jl
deleted file mode 100644
index e69de29..0000000
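
Reviewer note, not part of the diff: the snippet below is a minimal smoke-test sketch of the API this patch adds. It assumes the package is dev'd into the active environment and that Julia was started with multiple threads (e.g. `julia --threads=4`); the matrix sizes and the α/β values here are illustrative, not taken from the PR.

    using LinearAlgebra, SparseArrays
    using ParallelMergeCSR  # loading the module replaces SparseArrays.mul! for CSC inputs

    A = sprand(1_000, 1_000, 0.05)   # sparse CSC matrix
    B = rand(1_000, 4)               # dense right-hand side
    C = zeros(1_000, 4)

    mul!(C, A, B, 2.0, 0.0)          # C = A*B*2.0, via the atomics-based kernel
    mul!(C, A', B, 1.5, 1.0)         # C += adjoint(A)*B*1.5, via merge_csr_mv!

    # should hold up to roundoff (illustrative check, not from the PR's tests)
    C ≈ Matrix(A') * B * 1.5 + Matrix(A) * B * 2.0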
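A second reviewer note, also not part of the diff: how the merge-path split balances work. The kernel conceptually merges the row-end-offsets array with the natural numbers 1..nnz; cutting that combined list into equal shares gives each thread a balanced mix of row transitions and nonzeros, and merge_path_search finds the cut point by binary search along a diagonal. The sketch below reaches into non-exported names (ParallelMergeCSR.merge_path_search, SparseArrays.getcolptr) purely for illustration.

    using SparseArrays
    using ParallelMergeCSR

    # the 4x6 Wikipedia matrix used in the tests
    A = sparse([10 20 0 0 0 0;
                0 30 0 40 0 0;
                0 0 50 60 70 0;
                0 0 0 0 0 80])

    cp = SparseArrays.getcolptr(A)        # [1, 2, 4, 5, 7, 8, 9]
    row_end_offsets = cp[2:end]           # the kernel's "rows" are columns of the CSC input
    nnz_count = length(nonzeros(A))       # 8 stored values
    num_merge_items = nnz_count + length(row_end_offsets)   # 8 + 6 = 14

    # Where would thread 2 of 2 start? Halve the merge path and search the diagonal.
    half = cld(num_merge_items, 2)
    c = ParallelMergeCSR.merge_path_search(half, length(row_end_offsets), nnz_count,
                                           row_end_offsets, collect(1:nnz_count))
    (c.x, c.y)   # 0-based (row, nonzero) coordinate where the second thread picks up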