diff --git a/README.md b/README.md
index f3806db..10280a6 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,60 @@
# ParallelMergeCSR.jl
-A Sparse Matrix library implemented Merge-based Parallel Sparse Matrix-Vector Multiplication.
-see [this paper](https://rd.yyrcd.com/CUDA/2022-03-14-Merge-based%20Parallel%20Sparse%20Matrix-Vector%20Multiplication.pdf)
+[](https://github.com/QuEraComputing/Bloqade.jl/actions)
+
+
+An implementation/port of Merrill and Garland's Merge-based Parallel Sparse Matrix-Vector Multiplication (10.1109/SC.2016.57) paper in
+the
+
+
+ Julia Programming Language
+
+
+
+
+ParallelMergeCSR allows you to perform *multithreaded* [CSC formatted sparse Matrix](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_column_.28CSC_or_CCS.29) multiplication against dense vectors and matrices as long as the sparse Matrix has had a **transpose** or **adjoint** operation applied to it via `LinearAlgebra`, built-in to Julia Base. The reason for this is the original algorithm was restricted to [CSR formatted sparse Matrices](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format)) but by taking the transpose of a CSC matrix you've created a CSR representation of the same matrix.
+
+ParallelMergeCSR only exports one function: `mul!` which is used for both Sparse Matrix - Dense Vector and Sparse Matrix - Dense Matrix multiplication.
+
+## Installation
+
+Enter the Julia Package Manager and install via:
+
+```
+pkg> add ParallelMergeCSR
+```
+
+## Usage
+
+Start Julia with the desired number of threads by launching it with the `-t`/`--threads` argument:
+```
+julia --threads <number_of_threads>
+```
+or setting `JULIA_NUM_THREADS` in your environment and then running Julia.
+
+You can confirm Julia is using the specified number of threads via:
+
+```julia
+julia> Threads.nthreads()
+```
+
+You can then use `ParallelMergeCSR` in a similar fashion to the example below:
+
+```julia
+julia> using ParallelMergeCSR, SparseArrays
+
+# create a 20x20 transposed CSC-formatted Sparse matrix with a 30% chance of values appearing
+julia> A = transpose(sprand(20, 20, 0.3));
+
+# dense vector (can be a matrix too)
+julia> B = rand(size(A, 2));
+
+# output
+julia> C = rand(size(A, 2));
+
+# coefficients
+julia> α = -1.0; β = 2.9;
+
+# perform the operation C = ABα + Cβ, mutating C to store the answer
+julia> ParallelMergeCSR.mul!(C, A, B, α, β)
+```
diff --git a/src/parallel_csr_mv.jl b/src/parallel_csr_mv.jl
index c2662ea..84bfdac 100644
--- a/src/parallel_csr_mv.jl
+++ b/src/parallel_csr_mv.jl
@@ -1,5 +1,12 @@
+"""
+ Range <: AbstractVector{Int}
+Functionally equivalent to `CountingInputIterator` from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+Acts as a vector of integers from `start` to `stop` and used to represent indices to the non-zero values of the sparse matrix.
+
+See also: merge_csr_mv!
+"""
struct Range <: AbstractVector{Int}
start::Int
stop::Int
@@ -8,29 +15,65 @@ struct Range <: AbstractVector{Int}
end
end
+"""
+ Base.length(range::Range)
+
+Gets the length of a range, computed as `stop - start` — the number of values
+reachable through `getindex` (which yields `start + 1` through `stop`).
+"""
Base.length(range::Range) = (range.stop-range.start) # includes both ends
+
+"""
+ Base.size(range::Range)
+
+Gets the size of a range, which is equal to its length.
+
+See also: `Base.length(range::Range)`
+"""
Base.size(range::Range) = (length(range),)
+"""
+ Base.getindex(range::Range,index::Int)
+
+Index into a `Range` using `index` with an exception thrown if the index falls out of bounds
+of the range.
+"""
@inline function Base.getindex(range::Range,index::Int)
@boundscheck (0 < index ≤ length(range)) || throw(BoundsError("attempting to access the $(range.stop-range.start)-element Range at index [$(index)]"))
return index + range.start
end
+"""
+ mutable struct Coordinate
+An implementation of the C++ `CoordinateT` struct from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Holds the "coordinate" (a combination of a row offset value and an index into the
+non-zero values) generated by `merge_path_search`.
+
+See also: `merge_path_search`
+"""
mutable struct Coordinate
x::Int
y::Int
end
-# Want to give the illusion that we're still using 0-based coordinates when the actual
-# values themselves are 1-based
-#
-# diagonal -> i + j = k
-# a_len and b_len can stay the same from the paper
-#
-# a -> row-end offsets so really pass in a[2:end]
-# b -> "natural" numbers
+
+"""
+ merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector)
+
+An implementation of the C++ `MergePathSearch` function from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Given a `diagonal` the grid formed by the row offsets `a` and the indices of the non-zero values `b`
+as well as their respective lengths `a_len` and `b_len`, find the coordinate on the grid where a thread could
+begin/end at for its workload.
+"""
function merge_path_search(diagonal::Int, a_len::Int, b_len::Int, a::AbstractVector, b::AbstractVector)
+
+ # diagonal -> i + j = k
+ # a_len and b_len can stay the same from the paper
+ # a are row-end offsets so really pass in a[2:end]
+ # b are "natural" numbers (need to start at 1 instead of paper's 0 due to Julia 1-indexing)
# Diagonal search range (in x coordinate space)
x_min = max(diagonal - b_len, 0)
x_max = min(diagonal, a_len)
@@ -53,7 +96,18 @@ end
# Internally treats the matrix as if its been transposed/is an adjoint
# e.g. if you pass in a 2x3, internally it's a 3x2 and can be an adjoint if you pass in `adjoint` as the op
-function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+"""
+ merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
+
+An implementation of the C++ `OmpMergeCsrmv` function from the Merrill and Garland paper (https://doi.org/10.1109/SC.2016.57)
+
+Performs the operation output = A * input * α + output.
+
+Note that while A is a CSC-formatted sparse matrix, internally the function treats it as a CSR matrix (e.g. if you passed
+in a 2x4, internally it's a 4x2). The `op` argument is used to handle the case where the CSC matrix should be treated
+as if it's had the adjoint applied to it.
+"""
+function merge_csr_mv!(α::Number, A::AbstractSparseMatrixCSC, input::StridedVector, output::StridedVector, op)
nzv = nonzeros(A)
rv = rowvals(A)
@@ -61,11 +115,10 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
nnz = length(nzv)
- # nrows = length(cp) - 1 can give the wrong number of rows!
nrows = A.n
nz_indices = Range(1,nnz)
- row_end_offsets = cp[2:end] # nzval ordering is diff for diff formats
+ row_end_offsets = cp[2:end]
num_merge_items = nnz + nrows # preserve the dimensions of the original matrix
num_threads = nthreads()
@@ -117,27 +170,31 @@ function merge_csr_mv!(α::Number,A::AbstractSparseMatrixCSC, input::StridedVect
end
-# C = adjoint(A)Bα + Cβ
-# C = transpose(A)B + Cβ
-# C = xABα + Cβ
for (T, t) in ((Adjoint, adjoint), (Transpose, transpose))
- @eval function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
- # obtains the original matrix underneath the "lazy wrapper"
- A = xA.parent
- size(A, 2) == size(C, 1) || throw(DimensionMismatch())
- size(A, 1) == size(B, 1) || throw(DimensionMismatch())
- size(B, 2) == size(C, 2) || throw(DimensionMismatch())
- if β != 1
- # preemptively handle β so we just handle C = ABα + C
- β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
- end
- for (col_idx, input) in enumerate(eachcol(B))
- output = @view C[:, col_idx]
- merge_csr_mv!(α, A, input, output, $t)
+ @eval begin
+ """
+ mul!(C::StridedVecOrMat, xA::$($T){<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+
+ Performs the operation C = xABα + Cβ where xA is an `AbstractSparseMatrixCSC` that has been wrapped by `$($T)`
+ (accomplishable by calling `$($t)` on the CSC matrix).
+ """
+ function mul!(C::StridedVecOrMat, xA::$T{<:Any,<:AbstractSparseMatrixCSC}, B::DenseInputVecOrMat, α::Number, β::Number)
+ # obtains the original matrix underneath the "lazy wrapper"
+ A = xA.parent
+ size(A, 2) == size(C, 1) || throw(DimensionMismatch())
+ size(A, 1) == size(B, 1) || throw(DimensionMismatch())
+ size(B, 2) == size(C, 2) || throw(DimensionMismatch())
+ if β != 1
+ # preemptively handle β so we just handle C = ABα + C
+ β != 0 ? SparseArrays.rmul!(C, β) : fill!(C, zero(eltype(C)))
+ end
+ for (col_idx, input) in enumerate(eachcol(B))
+ output = @view C[:, col_idx]
+ merge_csr_mv!(α, A, input, output, $t)
+ end
+ C
+ # end of @eval macro
end
- C
- # end of @eval macro
- end
# end of for loop
-
+ end
end
\ No newline at end of file
diff --git a/test/merge_csr_mv!.jl b/test/merge_csr_mv!.jl
index d7beca7..3f4ef08 100644
--- a/test/merge_csr_mv!.jl
+++ b/test/merge_csr_mv!.jl
@@ -19,9 +19,6 @@ using SparseArrays: sprand, sparse
end
-## NOTE: Sparse matrices are converted to dense form in the @test's
-## considering that our redefinition of SparseArrays.mul! seems to
-## interfere
@testset "Extreme Cases" begin
@testset "Singleton (Real)" begin
A = sparse(reshape([1], 1, 1))
@@ -90,7 +87,6 @@ end
=#
end
-# test fails, y seems to have lots of zero-entries
@testset "Square (Real)" begin
A = sprand(10,10,0.3)
@@ -195,7 +191,7 @@ end
end
#=
- NOTE: While merge_csr_mv! can be used this way, the overriden SparseArrays.mul!
+ NOTE: While merge_csr_mv! can be used this way, the mul!
should be the preferred method to do Matrix-Matrix multiplication.
=#
@testset "Matrix-Matrix (Real)" begin