From 0f4bc1a62d0e4cc9762168a067e46d815a5aae91 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 09:42:20 -0500 Subject: [PATCH 1/9] moving sources into separate files --- src/ParallelMergeCSR.jl | 178 +--------------------------------------- src/parallel_csc_mv.jl | 50 +++++++++++ src/parallel_csr_mv.jl | 127 ++++++++++++++++++++++++++++ 3 files changed, 180 insertions(+), 175 deletions(-) create mode 100644 src/parallel_csc_mv.jl create mode 100644 src/parallel_csr_mv.jl diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 1a74996..44c821f 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -8,180 +8,8 @@ using SparseArrays: AbstractSparseMatrixCSC, DenseInputVecOrMat, getcolptr -mutable struct Coordinate - x::Int - y::Int -end -# Want to give the illusion that we're still using 0-based coordinates when the actual -# values themselves are 1-based -# -# diagonal -> i + j = k -# a_len and b_len can stay the same from the paper -# -# a -> row-end offsets so really pass in a[2:end] -# b -> "natural" numbers -function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) - # Diagonal search range (in x coordinate space) - x_min = max(diagonal - b_len, 0) - x_max = min(diagonal, a_len) - # 2D binary-search along diagonal search range - while (x_min < x_max) - pivot = (x_min + x_max) >> 1 - if (a[pivot + 1] <= b[diagonal - pivot]) - x_min = pivot + 1 - else - x_max = pivot - end - end +include("csrmv_merge.jl") +include("parallel_csc_mv.jl") - return Coordinate( - min(x_min, a_len), - diagonal - x_min - ) - -end - -# Internally treats the matrix as if its been transposed/is an adjoint -# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op -function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) - - nzv = nonzeros(A) - rv = rowvals(A) - cp = getcolptr(A) - - nnz = length(nzv) - - # nrows = length(cp) - 1 can give the wrong number of rows! - nrows = A.n - - nz_indices = collect(1:nnz) - row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats - num_merge_items = nnz + nrows # preserve the dimensions of the original matrix - - num_threads = nthreads() - items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads - - row_carry_out = zeros(eltype(cp), num_threads) - value_carry_out = zeros(eltype(output), num_threads) # value must match output - - # Julia threads start id by 1, so make sure to offset! - @threads for tid in 1:num_threads - diagonal = min(items_per_thread * (tid - 1), num_merge_items) - diagonal_end = min(diagonal + items_per_thread, num_merge_items) - - # Get starting and ending thread coordinates (row, nzv) - thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices) - thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) - - # Consume merge items, whole rows first - running_total = zero(eltype(output)) - while thread_coord.x < thread_coord_end.x - @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] - thread_coord.y += 1 - end - - @inbounds output[thread_coord.x + 1] += α * running_total - running_total = zero(eltype(output)) - thread_coord.x += 1 - end - - # May have thread end up partially consuming a row. - # Save result form partial consumption and do one pass at the end to add it back to y - while thread_coord.y < thread_coord_end.y - @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] - thread_coord.y += 1 - end - - # Save carry-outs - @inbounds row_carry_out[tid] = thread_coord_end.x + 1 - @inbounds value_carry_out[tid] = running_total - - end - - for tid in 1:num_threads - @inbounds if row_carry_out[tid] <= nrows - @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid] - end - end - -end - - -# C = adjoint(A)Bα + Cβ -# C = transpose(A)B + Cβ -# C = xABα + Cβ -for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) - @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) - # obtains the original matrix underneath the "lazy wrapper" - A = xA.parent - size(A, 2) == size(C, 1) || throw(DimensionMismatch()) - size(A, 1) == size(B, 1) || throw(DimensionMismatch()) - size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - if β != 1 - # preemptively handle β so we just handle C = ABα + C - β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) - end - for (col_idx, input) in enumerate(eachcol(B)) - output = @view C[:, col_idx] - merge_csr_mv!(α, A, input, output, $t) - end - C - # end of @eval macro - end - # end of for loop -end - - -## function names should be suffixed w/ exclamation mark -function atomic_add!(a::AbstractVector{T},b::T) where T <: Real - # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes - # you'd have to wrap each element of the array with Atomic - # Objective: for each element in AbstractVector, atomically add b to the element - # Can also use Atomix.jl - for idx in 1:length(a) - Atomix.@atomic a[idx] += b - end - -end - -function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex - # return type representing real part - view_a = reinterpret(reshape, real(T),a) - real_b = real(b) - imag_b = imag(b) - - # mutating the view also mutates the underlying array it came from - for idx in 1:size(view_a, 2) - Atomix.@atomic view_a[1, idx] += real_b - Atomix.@atomic view_a[2, idx] += imag_b - end -end - -function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) - size(A, 2) == size(B, 1) || throw(DimensionMismatch()) - size(A, 1) == size(C, 1) || throw(DimensionMismatch()) - size(B, 2) == size(C, 2) || throw(DimensionMismatch()) - nzv = nonzeros(A) - rv = rowvals(A) - if β != 1 - # preemptively handle β so we just handle C = ABα + C - β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) - end - for k in 1:size(C, 2) - @threads for col in 1:size(A, 2) - @inbounds αxj = B[col,k] * α - for j in nzrange(A, col) - @inbounds val = nzv[j]*αxj - @inbounds row = rv[j] - @inbounds out = @view C[row:row, k] - atomic_add!(out,val) - end - end - end - C -end - -# end of the module -end \ No newline at end of file +end # end of the module diff --git a/src/parallel_csc_mv.jl b/src/parallel_csc_mv.jl new file mode 100644 index 0000000..92ae8f5 --- /dev/null +++ b/src/parallel_csc_mv.jl @@ -0,0 +1,50 @@ + + +## function names should be suffixed w/ exclamation mark +function atomic_add!(a::AbstractVector{T},b::T) where T <: Real + # There's an atomic_add! in Threads but has signature (x::Atomic{T}, val::T) where T <: ArithmeticTypes + # you'd have to wrap each element of the array with Atomic + # Objective: for each element in AbstractVector, atomically add b to the element + # Can also use Atomix.jl + for idx in 1:length(a) + Atomix.@atomic a[idx] += b + end + +end + +function atomic_add!(a::AbstractVector{T},b::T) where T <: Complex + # return type representing real part + view_a = reinterpret(reshape, real(T),a) + real_b = real(b) + imag_b = imag(b) + + # mutating the view also mutates the underlying array it came from + for idx in 1:size(view_a, 2) + Atomix.@atomic view_a[1, idx] += real_b + Atomix.@atomic view_a[2, idx] += imag_b + end +end + +function SparseArrays.mul!(C::StridedVecOrMat, A::AbstractSparseMatrixCSC, B::DenseInputVecOrMat, α::Number, β::Number) + size(A, 2) == size(B, 1) || throw(DimensionMismatch()) + size(A, 1) == size(C, 1) || throw(DimensionMismatch()) + size(B, 2) == size(C, 2) || throw(DimensionMismatch()) + nzv = nonzeros(A) + rv = rowvals(A) + if β != 1 + # preemptively handle β so we just handle C = ABα + C + β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) + end + for k in 1:size(C, 2) + @threads for col in 1:size(A, 2) + @inbounds αxj = B[col,k] * α + for j in nzrange(A, col) + @inbounds val = nzv[j]*αxj + @inbounds row = rv[j] + @inbounds out = @view C[row:row, k] + atomic_add!(out,val) + end + end + end + C +end \ No newline at end of file diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl new file mode 100644 index 0000000..301d53f --- /dev/null +++ b/src/parallel_csr_mv.jl @@ -0,0 +1,127 @@ + + + +mutable struct Coordinate + x::Int + y::Int +end + +# Want to give the illusion that we're still using 0-based coordinates when the actual +# values themselves are 1-based +# +# diagonal -> i + j = k +# a_len and b_len can stay the same from the paper +# +# a -> row-end offsets so really pass in a[2:end] +# b -> "natural" numbers +function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) + # Diagonal search range (in x coordinate space) + x_min = max(diagonal - b_len, 0) + x_max = min(diagonal, a_len) + # 2D binary-search along diagonal search range + while (x_min < x_max) + pivot = (x_min + x_max) >> 1 + if (a[pivot + 1] <= b[diagonal - pivot]) + x_min = pivot + 1 + else + x_max = pivot + end + end + + return Coordinate( + min(x_min, a_len), + diagonal - x_min + ) + +end + +# Internally treats the matrix as if its been transposed/is an adjoint +# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op +function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) + + nzv = nonzeros(A) + rv = rowvals(A) + cp = getcolptr(A) + + nnz = length(nzv) + + # nrows = length(cp) - 1 can give the wrong number of rows! + nrows = A.n + + nz_indices = collect(1:nnz) + row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats + num_merge_items = nnz + nrows # preserve the dimensions of the original matrix + + num_threads = nthreads() + items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads + + row_carry_out = zeros(eltype(cp), num_threads) + value_carry_out = zeros(eltype(output), num_threads) # value must match output + + # Julia threads start id by 1, so make sure to offset! + @threads for tid in 1:num_threads + diagonal = min(items_per_thread * (tid - 1), num_merge_items) + diagonal_end = min(diagonal + items_per_thread, num_merge_items) + + # Get starting and ending thread coordinates (row, nzv) + thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices) + thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) + + # Consume merge items, whole rows first + running_total = zero(eltype(output)) + while thread_coord.x < thread_coord_end.x + @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + thread_coord.y += 1 + end + + @inbounds output[thread_coord.x + 1] += α * running_total + running_total = zero(eltype(output)) + thread_coord.x += 1 + end + + # May have thread end up partially consuming a row. + # Save result form partial consumption and do one pass at the end to add it back to y + while thread_coord.y < thread_coord_end.y + @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + thread_coord.y += 1 + end + + # Save carry-outs + @inbounds row_carry_out[tid] = thread_coord_end.x + 1 + @inbounds value_carry_out[tid] = running_total + + end + + for tid in 1:num_threads + @inbounds if row_carry_out[tid] <= nrows + @inbounds output[row_carry_out[tid]] += α * value_carry_out[tid] + end + end + +end + + +# C = adjoint(A)Bα + Cβ +# C = transpose(A)B + Cβ +# C = xABα + Cβ +for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) + @eval function SparseArrays.mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number) + # obtains the original matrix underneath the "lazy wrapper" + A = xA.parent + size(A, 2) == size(C, 1) || throw(DimensionMismatch()) + size(A, 1) == size(B, 1) || throw(DimensionMismatch()) + size(B, 2) == size(C, 2) || throw(DimensionMismatch()) + if β != 1 + # preemptively handle β so we just handle C = ABα + C + β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C))) + end + for (col_idx, input) in enumerate(eachcol(B)) + output = @view C[:, col_idx] + merge_csr_mv!(α, A, input, output, $t) + end + C + # end of @eval macro + end + # end of for loop +end From d5968896518a709f3b315c3057787c0dbb72028a Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 09:45:47 -0500 Subject: [PATCH 2/9] fixing file name --- src/ParallelMergeCSR.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ParallelMergeCSR.jl b/src/ParallelMergeCSR.jl index 44c821f..590916d 100644 --- a/src/ParallelMergeCSR.jl +++ b/src/ParallelMergeCSR.jl @@ -9,7 +9,7 @@ using SparseArrays: AbstractSparseMatrixCSC, getcolptr -include("csrmv_merge.jl") +include("parallel_csr_mv.jl") include("parallel_csc_mv.jl") end # end of the module From 42216ec6ee8c0e2a7f12e40b0cf259ad2bced635 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:18:32 -0500 Subject: [PATCH 3/9] replace collect(1:nnz) with Range struct,indexing --- src/parallel_csr_mv.jl | 97 ++++++++++++++++++++++++++++++++++++++---- 1 file changed, 88 insertions(+), 9 deletions(-) diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl index 301d53f..e551928 100644 --- a/src/parallel_csr_mv.jl +++ b/src/parallel_csr_mv.jl @@ -1,5 +1,17 @@ +struct Range <: AbstractVector{Int} + start::Int + stop::Int +end + +Base.length(range::Range) = (range.stop-range.start) +Base.size(range::Range) = (length(range),) + +function Base.getindex(range::Range,index::Int) + (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]")) + return index + range.start - 1 +end mutable struct Coordinate x::Int @@ -29,14 +41,12 @@ function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) end return Coordinate( - min(x_min, a_len), - diagonal - x_min + min(x_min, a_len)+1, + (diagonal - x_min)+1 ) end - -# Internally treats the matrix as if its been transposed/is an adjoint -# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op +#= function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) nzv = nonzeros(A) @@ -48,7 +58,7 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # nrows = length(cp) - 1 can give the wrong number of rows! nrows = A.n - nz_indices = collect(1:nnz) + nz_indices = Range(1,nnz) row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats num_merge_items = nnz + nrows # preserve the dimensions of the original matrix @@ -70,8 +80,8 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect # Consume merge items, whole rows first running_total = zero(eltype(output)) while thread_coord.x < thread_coord_end.x - @inbounds while thread_coord.y < row_end_offsets[thread_coord.x + 1] - 1 - @inbounds running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] + @inbounds while thread_coord.y + 1 < row_end_offsets[thread_coord.x + 1] + running_total += op(nzv[thread_coord.y + 1]) * input[rv[thread_coord.y + 1]] thread_coord.y += 1 end @@ -102,6 +112,75 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect end +=# + +# Internally treats the matrix as if its been transposed/is an adjoint +# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op +function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op) + + nzv = nonzeros(A) + rv = rowvals(A) + cp = getcolptr(A) + + nnz = length(nzv) + + # nrows = length(cp) - 1 can give the wrong number of rows! + nrows = A.n + + nz_indices = Range(1,nnz) + row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats + num_merge_items = nnz + nrows # preserve the dimensions of the original matrix + + num_threads = nthreads() + items_per_thread = (num_merge_items + num_threads - 1) ÷ num_threads + + row_carry_out = zeros(eltype(cp), num_threads) + value_carry_out = zeros(eltype(output), num_threads) # value must match output + + # Julia threads start id by 1, so make sure to offset! + @threads for tid in 1:num_threads + diagonal = min(items_per_thread * (tid - 1), num_merge_items) + diagonal_end = min(diagonal + items_per_thread, num_merge_items) + + # Get starting and ending thread coordinates (row, nzv) + thread_coord = merge_path_search(diagonal, nrows, nnz, row_end_offsets, nz_indices) + thread_coord_end = merge_path_search(diagonal_end, nrows, nnz, row_end_offsets, nz_indices) + + # Consume merge items, whole rows first + running_total = zero(eltype(output)) + @inbounds while thread_coord.x < thread_coord_end.x + while thread_coord.y < row_end_offsets[thread_coord.x] + running_total += op(nzv[thread_coord.y]) * input[rv[thread_coord.y]] + thread_coord.y += 1 + end + + output[thread_coord.x] += α * running_total + running_total = zero(eltype(output)) + thread_coord.x += 1 + end + + # May have thread end up partially consuming a row. + # Save result form partial consumption and do one pass at the end to add it back to y + @inbounds while thread_coord.y < thread_coord_end.y + running_total += op(nzv[thread_coord.y]) * input[rv[thread_coord.y]] + thread_coord.y += 1 + end + + # Save carry-outs + @inbounds row_carry_out[tid] = thread_coord_end.x + @inbounds value_carry_out[tid] = running_total + + end + + @inbounds for tid in 1:num_threads + if row_carry_out[tid] <= nrows + output[row_carry_out[tid]] += α * value_carry_out[tid] + end + end + +end + + # C = adjoint(A)Bα + Cβ # C = transpose(A)B + Cβ # C = xABα + Cβ @@ -124,4 +203,4 @@ for (T, t) in ((Adjoint, adjoint), (Transpose, transpose)) # end of @eval macro end # end of for loop -end +end \ No newline at end of file From 18c342bf3fe543de676852516faf4c2db08ba193 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:36:00 -0500 Subject: [PATCH 4/9] ignoring .vscode --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 29126e4..4b2fb27 100644 --- a/.gitignore +++ b/.gitignore @@ -22,3 +22,4 @@ docs/site/ # committed for packages, but should be committed for applications that require a static # environment. Manifest.toml +.vscode/ \ No newline at end of file From c2caf88b361b85349f030d4af5a508bb2405f1f0 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:36:50 -0500 Subject: [PATCH 5/9] adding Range tests and fixing interface with Range --- src/parallel_csr_mv.jl | 2 +- test/merge_csr_mv!.jl | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl index e551928..fa1b8dc 100644 --- a/src/parallel_csr_mv.jl +++ b/src/parallel_csr_mv.jl @@ -5,7 +5,7 @@ struct Range <: AbstractVector{Int} stop::Int end -Base.length(range::Range) = (range.stop-range.start) +Base.length(range::Range) = (range.stop-range.start) + 1 # includes both ends Base.size(range::Range) = (length(range),) function Base.getindex(range::Range,index::Int) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 0eda52c..b7e7cd3 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -1,12 +1,26 @@ using Test -using ParallelMergeCSR: merge_csr_mv! +using ParallelMergeCSR: merge_csr_mv!, Range using SparseArrays + +@testset "Range" begin + + range = Range(1,10) + + @test length(range) == 10 + @test size(range,1) == 10 + @test size(range,2) == 1 + + for i in 1:10 + @test range[i] == i + end + +end + ## NOTE: Sparse matrices are converted to dense form in the @test's ## considering that our redefinition of SparseArrays.mul! seems to ## interfere @testset "Extreme Cases" begin - @testset "Singleton (Real)" begin A = sparse(reshape([1], 1, 1)) From 9c045dde9e894c2cf1b80e16ffa50a3ac9a7cc23 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:42:32 -0500 Subject: [PATCH 6/9] subtracting 1 to reduce number of additions during indexing --- src/parallel_csr_mv.jl | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl index 52e0950..b1183ee 100644 --- a/src/parallel_csr_mv.jl +++ b/src/parallel_csr_mv.jl @@ -3,14 +3,17 @@ struct Range <: AbstractVector{Int} start::Int stop::Int + function Range(start::Int,stop::Int) + new(start-1,stop) + end end -Base.length(range::Range) = (range.stop-range.start) + 1 # includes both ends +Base.length(range::Range) = (range.stop-range.start) # includes both ends Base.size(range::Range) = (length(range),) function Base.getindex(range::Range,index::Int) (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]")) - return index + range.start - 1 + return index + range.start end From 8e6e10aa5de582c6460510ce5d3536a24c54cca9 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:49:37 -0500 Subject: [PATCH 7/9] adding boundscheck --- src/parallel_csr_mv.jl | 4 ++-- test/merge_csr_mv!.jl | 3 +++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl index b1183ee..514c9fb 100644 --- a/src/parallel_csr_mv.jl +++ b/src/parallel_csr_mv.jl @@ -11,8 +11,8 @@ end Base.length(range::Range) = (range.stop-range.start) # includes both ends Base.size(range::Range) = (length(range),) -function Base.getindex(range::Range,index::Int) - (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]")) +@inline function Base.getindex(range::Range,index::Int) + @boundscheck (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]")) return index + range.start end diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index b7e7cd3..9b651c3 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -15,6 +15,9 @@ using SparseArrays @test range[i] == i end + @test_throws BoundsError range[11] + @test @inbounds range[11] == 11 # checking if bound check is removed + end ## NOTE: Sparse matrices are converted to dense form in the @test's From cf24d8178920e796b1d0ed1e6dd86660a3bb3a54 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:50:31 -0500 Subject: [PATCH 8/9] removing bounds check for merge_path_search --- src/parallel_csr_mv.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl index 514c9fb..e7122c4 100644 --- a/src/parallel_csr_mv.jl +++ b/src/parallel_csr_mv.jl @@ -30,12 +30,12 @@ end # # a -> row-end offsets so really pass in a[2:end] # b -> "natural" numbers -function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a, b) +function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector) # Diagonal search range (in x coordinate space) x_min = max(diagonal - b_len, 0) x_max = min(diagonal, a_len) # 2D binary-search along diagonal search range - while (x_min < x_max) + @inbounds while (x_min < x_max) pivot = (x_min + x_max) >> 1 if (a[pivot + 1] <= b[diagonal - pivot]) x_min = pivot + 1 From fb018578f04979361abe29e8d2623c9d6e6f17f9 Mon Sep 17 00:00:00 2001 From: Phillip Weinberg Date: Thu, 5 Jan 2023 10:54:48 -0500 Subject: [PATCH 9/9] removing out of bounds check --- test/merge_csr_mv!.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl index 9b651c3..5375e85 100644 --- a/test/merge_csr_mv!.jl +++ b/test/merge_csr_mv!.jl @@ -16,7 +16,6 @@ using SparseArrays end @test_throws BoundsError range[11] - @test @inbounds range[11] == 11 # checking if bound check is removed end