From 0f4bc1a62d0e4cc9762168a067e46d815a5aae91 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 09:42:20 -0500
Subject: [PATCH 1/9] moving sources into separate files

---
 src/ParallelMergeCSR.jl | 178 +---------------------------------------
 src/parallel_csc_mv.jl  |  50 +++++++++++
 src/parallel_csr_mv.jl  | 127 ++++++++++++++++++++++++++++
 3 files changed, 180 insertions(+), 175 deletions(-)
 create mode 100644 src/parallel_csc_mv.jl
 create mode 100644 src/parallel_csr_mv.jl

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 1a74996..44c821f 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -8,180 +8,8 @@ using SparseArrays: AbstractSparseMatrixCSC,
                     DenseInputVecOrMat,
                     getcolptr
 
-mutable struct Coordinate
-    x::Int
-    y::Int
-end
 
-# Want to give the illusion that we're still using 0-based coordinates when the actual
-# values themselves are 1-based
-# 
-# diagonal -> i + j = k 
-# a_len and b_len can stay the same from the paper
-# 
-# a -> row-end offsets so really pass in a[2:end]
-# b -> "natural" numbers
-function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
-    # Diagonal search range (in x coordinate space)
-    x_min = max(diagonal - b_len, 0)
-    x_max = min(diagonal, a_len)
-    # 2D binary-search along diagonal search range
-    while (x_min < x_max)
-        pivot = (x_min + x_max) >> 1
-        if (a[pivot + 1] <= b[diagonal - pivot])
-            x_min = pivot + 1
-        else
-            x_max = pivot
-        end
-    end
+include("csrmv_merge.jl")
+include("parallel_csc_mv.jl")
 
-    return Coordinate(
-        min(x_min, a_len),
-        diagonal - x_min
-    )
-
-end
-
-# Internally treats the matrix as if its been transposed/is an adjoint
-# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
-function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
-
-    nzv = nonzeros(A)
-    rv = rowvals(A)
-    cp = getcolptr(A)
-
-    nnz = length(nzv) 
-    
-    # nrows = length(cp) - 1 can give the wrong number of rows!
-    nrows = A.n
-
-    nz_indices = collect(1:nnz)
-    row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
-    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
-
-    num_threads = nthreads()
-    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
-
-    row_carry_out = zeros(eltype(cp), num_threads)
-    value_carry_out = zeros(eltype(output), num_threads) # value must match output
-
-    # Julia threads start id by 1, so make sure to offset!
-    @threads for tid in 1:num_threads
-        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
-        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
-
-        # Get starting and ending thread coordinates (row, nzv)
-        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
-        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
-
-        # Consume merge items, whole rows first
-        running_total = zero(eltype(output))
-        while thread_coord.x < thread_coord_end.x
-            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
-                thread_coord.y += 1
-            end
-
-            @inbounds output[thread_coord.x + 1] += α * running_total
-            running_total = zero(eltype(output))
-            thread_coord.x += 1 
-        end
-       
-        # May have thread end up partially consuming a row.
-        # Save result form partial consumption and do one pass at the end to add it back to y
-        while thread_coord.y < thread_coord_end.y
-            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
-            thread_coord.y += 1
-        end
-
-        # Save carry-outs
-        @inbounds row_carry_out[tid] = thread_coord_end.x + 1
-        @inbounds value_carry_out[tid] = running_total
-
-    end
-
-    for tid in 1:num_threads
-        @inbounds if row_carry_out[tid] <= nrows
-            @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid]
-        end
-    end
-
-end
-
-
-# C = adjoint(A)Bα + Cβ
-# C = transpose(A)B + Cβ
-# C = xABα + Cβ
-for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
-    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
-        # obtains the original matrix underneath the "lazy wrapper"
-        A = xA.parent
-        size(A, 2) == size(C, 1) || throw(DimensionMismatch())
-        size(A, 1) == size(B, 1) || throw(DimensionMismatch())
-        size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-        if β != 1
-            # preemptively handle β so we just handle C = ABα + C
-            β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
-        end
-        for (col_idx, input) in enumerate(eachcol(B))
-            output = @view C[:, col_idx]
-            merge_csr_mv!(α, A, input, output, $t)
-        end
-        C
-        # end of @eval macro
-    end
-    # end of for loop
-end
-
-
-## function names should be suffixed w/ exclamation mark
-function atomic_add!(a::AbstractVector{T},b::T) where T <: Real
-    # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes
-    # you'd have to wrap each element of the array with Atomic
-    # Objective: for each element in AbstractVector, atomically add b to the element
-    # Can also use Atomix.jl
-    for idx in 1:length(a)
-        Atomix.@atomic a[idx] += b
-    end
-    
-end
-
-function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex
-    # return type representing real part
-    view_a = reinterpret(reshape, real(T),a)
-    real_b = real(b)
-    imag_b = imag(b)
-
-    # mutating the view also mutates the underlying array it came from
-    for idx in 1:size(view_a, 2)
-        Atomix.@atomic view_a[1, idx] += real_b
-        Atomix.@atomic view_a[2, idx] += imag_b
-    end
-end
-
-function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
-    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
-    size(A, 1) == size(C, 1) || throw(DimensionMismatch())
-    size(B, 2) == size(C, 2) || throw(DimensionMismatch())
-    nzv = nonzeros(A)
-    rv = rowvals(A)
-    if β != 1
-        # preemptively handle β so we just handle C = ABα + C
-        β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
-    end
-    for k in 1:size(C, 2)
-        @threads for col in 1:size(A, 2)
-            @inbounds αxj = B[col,k] * α
-            for j in nzrange(A, col)
-                @inbounds val = nzv[j]*αxj
-                @inbounds row = rv[j]
-                @inbounds out = @view C[row:row, k]
-                atomic_add!(out,val) 
-            end
-        end
-    end
-    C
-end
-
-# end of the module
-end
\ No newline at end of file
+end # end of the module
diff --git a/src/parallel_csc_mv.jl b/src/parallel_csc_mv.jl
new file mode 100644
index 0000000..92ae8f5
--- /dev/null
+++ b/src/parallel_csc_mv.jl
@@ -0,0 +1,50 @@
+
+
+## function names should be suffixed w/ exclamation mark
+function atomic_add!(a::AbstractVector{T},b::T) where T <: Real
+    # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes
+    # you'd have to wrap each element of the array with Atomic
+    # Objective: for each element in AbstractVector, atomically add b to the element
+    # Can also use Atomix.jl
+    for idx in 1:length(a)
+        Atomix.@atomic a[idx] += b
+    end
+    
+end
+
+function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex
+    # return type representing real part
+    view_a = reinterpret(reshape, real(T),a)
+    real_b = real(b)
+    imag_b = imag(b)
+
+    # mutating the view also mutates the underlying array it came from
+    for idx in 1:size(view_a, 2)
+        Atomix.@atomic view_a[1, idx] += real_b
+        Atomix.@atomic view_a[2, idx] += imag_b
+    end
+end
+
+function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number)
+    size(A, 2) == size(B, 1) || throw(DimensionMismatch())
+    size(A, 1) == size(C, 1) || throw(DimensionMismatch())
+    size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    if β != 1
+        # preemptively handle β so we just handle C = ABα + C
+        β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+    end
+    for k in 1:size(C, 2)
+        @threads for col in 1:size(A, 2)
+            @inbounds αxj = B[col,k] * α
+            for j in nzrange(A, col)
+                @inbounds val = nzv[j]*αxj
+                @inbounds row = rv[j]
+                @inbounds out = @view C[row:row, k]
+                atomic_add!(out,val) 
+            end
+        end
+    end
+    C
+end
\ No newline at end of file
diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
new file mode 100644
index 0000000..301d53f
--- /dev/null
+++ b/src/parallel_csr_mv.jl
@@ -0,0 +1,127 @@
+
+
+
+mutable struct Coordinate
+    x::Int
+    y::Int
+end
+
+# Want to give the illusion that we're still using 0-based coordinates when the actual
+# values themselves are 1-based
+# 
+# diagonal -> i + j = k 
+# a_len and b_len can stay the same from the paper
+# 
+# a -> row-end offsets so really pass in a[2:end]
+# b -> "natural" numbers
+function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
+    # Diagonal search range (in x coordinate space)
+    x_min = max(diagonal - b_len, 0)
+    x_max = min(diagonal, a_len)
+    # 2D binary-search along diagonal search range
+    while (x_min < x_max)
+        pivot = (x_min + x_max) >> 1
+        if (a[pivot + 1] <= b[diagonal - pivot])
+            x_min = pivot + 1
+        else
+            x_max = pivot
+        end
+    end
+
+    return Coordinate(
+        min(x_min, a_len),
+        diagonal - x_min
+    )
+
+end
+
+# Internally treats the matrix as if its been transposed/is an adjoint
+# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
+function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    cp = getcolptr(A)
+
+    nnz = length(nzv) 
+    
+    # nrows = length(cp) - 1 can give the wrong number of rows!
+    nrows = A.n
+
+    nz_indices = collect(1:nnz)
+    row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
+    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
+
+    num_threads = nthreads()
+    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
+
+    row_carry_out = zeros(eltype(cp), num_threads)
+    value_carry_out = zeros(eltype(output), num_threads) # value must match output
+
+    # Julia threads start id by 1, so make sure to offset!
+    @threads for tid in 1:num_threads
+        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
+        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
+
+        # Get starting and ending thread coordinates (row, nzv)
+        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
+
+        # Consume merge items, whole rows first
+        running_total = zero(eltype(output))
+        while thread_coord.x < thread_coord_end.x
+            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
+                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+                thread_coord.y += 1
+            end
+
+            @inbounds output[thread_coord.x + 1] += α * running_total
+            running_total = zero(eltype(output))
+            thread_coord.x += 1 
+        end
+       
+        # May have thread end up partially consuming a row.
+        # Save result form partial consumption and do one pass at the end to add it back to y
+        while thread_coord.y < thread_coord_end.y
+            @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            thread_coord.y += 1
+        end
+
+        # Save carry-outs
+        @inbounds row_carry_out[tid] = thread_coord_end.x + 1
+        @inbounds value_carry_out[tid] = running_total
+
+    end
+
+    for tid in 1:num_threads
+        @inbounds if row_carry_out[tid] <= nrows
+            @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid]
+        end
+    end
+
+end
+
+
+# C = adjoint(A)Bα + Cβ
+# C = transpose(A)B + Cβ
+# C = xABα + Cβ
+for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
+    @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+        # obtains the original matrix underneath the "lazy wrapper"
+        A = xA.parent
+        size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+        size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+        size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+        if β != 1
+            # preemptively handle β so we just handle C = ABα + C
+            β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+        end
+        for (col_idx, input) in enumerate(eachcol(B))
+            output = @view C[:, col_idx]
+            merge_csr_mv!(α, A, input, output, $t)
+        end
+        C
+        # end of @eval macro
+    end
+    # end of for loop
+end

From d5968896518a709f3b315c3057787c0dbb72028a Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 09:45:47 -0500
Subject: [PATCH 2/9] fixing file name

---
 src/ParallelMergeCSR.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl
index 44c821f..590916d 100644
--- a/src/ParallelMergeCSR.jl
+++ b/src/ParallelMergeCSR.jl
@@ -9,7 +9,7 @@ using SparseArrays: AbstractSparseMatrixCSC,
                     getcolptr
 
 
-include("csrmv_merge.jl")
+include("parallel_csr_mv.jl")
 include("parallel_csc_mv.jl")
 
 end # end of the module

From 42216ec6ee8c0e2a7f12e40b0cf259ad2bced635 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:18:32 -0500
Subject: [PATCH 3/9] replace collect(1:nnz)  with Range struct,indexing

---
 src/parallel_csr_mv.jl | 97 ++++++++++++++++++++++++++++++++++++++----
 1 file changed, 88 insertions(+), 9 deletions(-)

diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index 301d53f..e551928 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -1,5 +1,17 @@
 
 
+struct Range <: AbstractVector{Int}
+    start::Int
+    stop::Int
+end
+
+Base.length(range::Range) =  (range.stop-range.start)
+Base.size(range::Range) = (length(range),)
+
+function Base.getindex(range::Range,index::Int)
+    (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
+    return index + range.start - 1
+end
 
 mutable struct Coordinate
     x::Int
@@ -29,14 +41,12 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
     end
 
     return Coordinate(
-        min(x_min, a_len),
-        diagonal - x_min
+        min(x_min, a_len)+1,
+        (diagonal - x_min)+1
     )
 
 end
-
-# Internally treats the matrix as if its been transposed/is an adjoint
-# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
+#=
 function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
 
     nzv = nonzeros(A)
@@ -48,7 +58,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
     # nrows = length(cp) - 1 can give the wrong number of rows!
     nrows = A.n
 
-    nz_indices = collect(1:nnz)
+    nz_indices = Range(1,nnz)
     row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
     num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
 
@@ -70,8 +80,8 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
         # Consume merge items, whole rows first
         running_total = zero(eltype(output))
         while thread_coord.x < thread_coord_end.x
-            @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1
-                @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
+            @inbounds while thread_coord.y + 1 < row_end_offsets[thread_coord.x + 1] 
+                running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]]
                 thread_coord.y += 1
             end
 
@@ -102,6 +112,75 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
 end
 
 
+=#
+
+# Internally treats the matrix as if its been transposed/is an adjoint
+# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
+function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+    nzv = nonzeros(A)
+    rv = rowvals(A)
+    cp = getcolptr(A)
+
+    nnz = length(nzv) 
+    
+    # nrows = length(cp) - 1 can give the wrong number of rows!
+    nrows = A.n
+
+    nz_indices = Range(1,nnz)
+    row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
+    num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
+
+    num_threads = nthreads()
+    items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads
+
+    row_carry_out = zeros(eltype(cp), num_threads)
+    value_carry_out = zeros(eltype(output), num_threads) # value must match output
+
+    # Julia threads start id by 1, so make sure to offset!
+    @threads for tid in 1:num_threads
+        diagonal = min(items_per_thread * (tid - 1), num_merge_items)
+        diagonal_end = min(diagonal + items_per_thread, num_merge_items)
+
+        # Get starting and ending thread coordinates (row, nzv)
+        thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices)
+        thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices)
+
+        # Consume merge items, whole rows first
+        running_total = zero(eltype(output))
+        @inbounds while thread_coord.x < thread_coord_end.x
+            while thread_coord.y < row_end_offsets[thread_coord.x] 
+                running_total += op(nzv[thread_coord.y]) * input[rv[thread_coord.y]]
+                thread_coord.y += 1
+            end
+
+            output[thread_coord.x] += α * running_total
+            running_total = zero(eltype(output))
+            thread_coord.x += 1 
+        end
+       
+        # May have thread end up partially consuming a row.
+        # Save result form partial consumption and do one pass at the end to add it back to y
+        @inbounds while thread_coord.y < thread_coord_end.y
+            running_total += op(nzv[thread_coord.y]) * input[rv[thread_coord.y]]
+            thread_coord.y += 1
+        end
+
+        # Save carry-outs
+        @inbounds row_carry_out[tid] = thread_coord_end.x
+        @inbounds value_carry_out[tid] = running_total
+
+    end
+
+    @inbounds for tid in 1:num_threads
+        if row_carry_out[tid] <= nrows
+            output[row_carry_out[tid]] += α * value_carry_out[tid]
+        end
+    end
+
+end
+
+
 # C = adjoint(A)Bα + Cβ
 # C = transpose(A)B + Cβ
 # C = xABα + Cβ
@@ -124,4 +203,4 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
         # end of @eval macro
     end
     # end of for loop
-end
+end
\ No newline at end of file

From 18c342bf3fe543de676852516faf4c2db08ba193 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:36:00 -0500
Subject: [PATCH 4/9] ignoring .vscode

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 29126e4..4b2fb27 100644
--- a/.gitignore
+++ b/.gitignore
@@ -22,3 +22,4 @@ docs/site/
 # committed for packages, but should be committed for applications that require a static
 # environment.
 Manifest.toml
+.vscode/
\ No newline at end of file

From c2caf88b361b85349f030d4af5a508bb2405f1f0 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:36:50 -0500
Subject: [PATCH 5/9] adding Range tests and fixing interface with Range

---
 src/parallel_csr_mv.jl |  2 +-
 test/merge_csr_mv!.jl  | 18 ++++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index e551928..fa1b8dc 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -5,7 +5,7 @@ struct Range <: AbstractVector{Int}
     stop::Int
 end
 
-Base.length(range::Range) =  (range.stop-range.start)
+Base.length(range::Range) =  (range.stop-range.start) + 1 # includes both ends
 Base.size(range::Range) = (length(range),)
 
 function Base.getindex(range::Range,index::Int)
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 0eda52c..b7e7cd3 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -1,12 +1,26 @@
 using Test
-using ParallelMergeCSR: merge_csr_mv!
+using ParallelMergeCSR: merge_csr_mv!, Range
 using SparseArrays
 
+
+@testset "Range" begin
+    
+    range = Range(1,10)
+
+    @test length(range) == 10
+    @test size(range,1) == 10
+    @test size(range,2) == 1
+
+    for i in 1:10
+        @test range[i] == i
+    end
+
+end
+
 ## NOTE: Sparse matrices are converted to dense form in the @test's
 ##       considering that our redefinition of SparseArrays.mul! seems to
 ##       interfere
 @testset "Extreme Cases" begin
-
     @testset "Singleton (Real)" begin
         A = sparse(reshape([1], 1, 1))
 

From 9c045dde9e894c2cf1b80e16ffa50a3ac9a7cc23 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:42:32 -0500
Subject: [PATCH 6/9] subtracting 1 to reduce number of additions during
 indexing

---
 src/parallel_csr_mv.jl | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index 52e0950..b1183ee 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -3,14 +3,17 @@
 struct Range <: AbstractVector{Int}
     start::Int
     stop::Int
+    function Range(start::Int,stop::Int)
+        new(start-1,stop)
+    end
 end
 
-Base.length(range::Range) =  (range.stop-range.start) + 1 # includes both ends
+Base.length(range::Range) =  (range.stop-range.start) # includes both ends
 Base.size(range::Range) = (length(range),)
 
 function Base.getindex(range::Range,index::Int)
     (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
-    return index + range.start - 1
+    return index + range.start
 end
 
 

From 8e6e10aa5de582c6460510ce5d3536a24c54cca9 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:49:37 -0500
Subject: [PATCH 7/9] adding boundscheck

---
 src/parallel_csr_mv.jl | 4 ++--
 test/merge_csr_mv!.jl  | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index b1183ee..514c9fb 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -11,8 +11,8 @@ end
 Base.length(range::Range) =  (range.stop-range.start) # includes both ends
 Base.size(range::Range) = (length(range),)
 
-function Base.getindex(range::Range,index::Int)
-    (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
+@inline function Base.getindex(range::Range,index::Int)
+    @boundscheck (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
     return index + range.start
 end
 
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index b7e7cd3..9b651c3 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -15,6 +15,9 @@ using SparseArrays
         @test range[i] == i
     end
 
+    @test_throws BoundsError range[11]
+    @test @inbounds range[11] == 11 # checking if bound check is removed
+
 end
 
 ## NOTE: Sparse matrices are converted to dense form in the @test's

From cf24d8178920e796b1d0ed1e6dd86660a3bb3a54 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:50:31 -0500
Subject: [PATCH 8/9] removing bounds check for merge_path_search

---
 src/parallel_csr_mv.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index 514c9fb..e7122c4 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -30,12 +30,12 @@ end
 # 
 # a -> row-end offsets so really pass in a[2:end]
 # b -> "natural" numbers
-function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b)
+function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector)
     # Diagonal search range (in x coordinate space)
     x_min = max(diagonal - b_len, 0)
     x_max = min(diagonal, a_len)
     # 2D binary-search along diagonal search range
-    while (x_min < x_max)
+    @inbounds while (x_min < x_max)
         pivot = (x_min + x_max) >> 1
         if (a[pivot + 1] <= b[diagonal - pivot])
             x_min = pivot + 1

From fb018578f04979361abe29e8d2623c9d6e6f17f9 Mon Sep 17 00:00:00 2001
From: Phillip Weinberg <weinbe58@gmail.com>
Date: Thu, 5 Jan 2023 10:54:48 -0500
Subject: [PATCH 9/9] removing out of bounds check

---
 test/merge_csr_mv!.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index 9b651c3..5375e85 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -16,7 +16,6 @@ using SparseArrays
     end
 
     @test_throws BoundsError range[11]
-    @test @inbounds range[11] == 11 # checking if bound check is removed
 
 end