diff --git a/Project.toml b/Project.toml
index 213b838d..1cabdfa6 100644
--- a/Project.toml
+++ b/Project.toml
@@ -3,7 +3,7 @@ uuid = "6f1fad26-d15e-5dc8-ae53-837a1d7b8c9f"
 license = "MIT"
 desc = "C shim for task copying in Turing"
 repo = "https://github.com/TuringLang/Libtask.jl.git"
-version = "0.5.1"
+version = "0.5.2"
 
 [deps]
 Libtask_jll = "3ae2931a-708c-5973-9c38-ccf7496fb450"
diff --git a/src/ctask.jl b/src/ctask.jl
index 06958d2b..8665c7b5 100644
--- a/src/ctask.jl
+++ b/src/ctask.jl
@@ -7,7 +7,10 @@ struct CTask
     task::Task
 
     function CTask(task::Task)
-        new(enable_stack_copying(task))
+        ctask = new(enable_stack_copying(task))
+        task.storage === nothing && (task.storage = IdDict())  # no storage yet: create it
+        task.storage[:ctask] = ctask  # the task keeps a reference to its wrapper
+        return ctask
     end
 end
 
@@ -24,12 +27,13 @@ end
 
 function Base.showerror(io::IO, ex::CTaskException)
     println(io, "CTaskException:")
+    ct = ex.task  # task carried by the exception; its backtrace and exception are used below
     bt = @static if VERSION < v"1.6.0-DEV.1145"
         ct.backtrace
     else
         ct.storage[:_libtask_bt]
     end
-    showerror(io, ex.task.exception, bt)
+    showerror(io, ct.exception, bt)  # delegate display to the task's own exception
 end
 
 # Utility function for self-copying mechanism
@@ -107,6 +111,7 @@ function Base.copy(ctask::CTask)
     setstate!(newtask, getstate(task))
     newtask.result = task.result
 
+    copy_tarrays(task, newtask)
     return CTask(newtask)
 end
 
diff --git a/src/tarray.jl b/src/tarray.jl
index 67df1f23..5212646f 100644
--- a/src/tarray.jl
+++ b/src/tarray.jl
@@ -7,7 +7,15 @@
 
 Implementation of data structures that automatically perform copy-on-write after task copying.
 
-If current_task is an existing key in `s`, then return `s[current_task]`. Otherwise, return `s[current_task] = s[last_task]`.
+Each `TArray` keeps one underlying array per task: reading from or
+writing to the same `TArray` in different tasks may therefore touch
+different array objects, each belonging to the task that accesses
+it.  All of these task-specific arrays share the same parent
+`TArray` object.
+
+More specifically, the underlying arrays are stored in the field
+`TArray.data`, a dictionary keyed by task, rather than in the task
+local storage, for performance reasons.
 
 Usage:
 
@@ -23,10 +31,15 @@ for i in 1:4 ta[i] = i end  # assign
 Array(ta)                   # convert to 4-element Array{Int64,1}: [1, 2, 3, 4]
 ```
 """
-struct TArray{T,N} <: AbstractArray{T,N}
-    ref :: Symbol  # object_id
+struct TArray{T, N, A <: AbstractArray{T, N}} <: AbstractArray{T, N}
     orig_task :: Task
-    TArray{T,N}() where {T,N} = new(gensym(), current_task())
+    data::Dict{Task, Tuple{Int, A}}  # task => (count passed when stored, array)
+    function TArray{T, N, A}() where {T, N, A <: AbstractArray{T, N}}
+        # Starts without any data; the new object is registered with the keeper.
+        tarray = new(current_task(), Dict{Task, Tuple{Int, A}}())
+        register_to_keeper(tarray)
+        return tarray
+    end
 end
 
 TArray{T}(d::Integer...) where T = TArray(T, d)
@@ -37,15 +50,49 @@ TArray{T,N}(::UndefInitializer, d::Vararg{<:Integer,N}) where {T,N} = TArray{T,N
 TArray{T,N}(dim::NTuple{N,Int}) where {T,N} = TArray(T, dim)
 
 function TArray(T::Type, dim)
-    res = TArray{T,length(dim)}();
+    N_dim = length(dim)  # number of dimensions, used as the `N` type parameter
+    res = TArray{T, N_dim, Array{T, N_dim}}()
     n = n_copies()
     d = Array{T}(undef, dim)
-    task_local_storage(res.ref, (n,d))
+    _set_local_storage(res, n, d)  # store the uninitialized array for the current task
     res
 end
 
 TArray(x::AbstractArray) = convert(TArray, x)
 
+# TArray House-Keeper: weak references to the TArrays registered by the
+# constructor, so `copy_tarrays(task1, task2)` can share task1's entries with task2.
+const TArrayKeeper = Vector{WeakRef}()
+register_to_keeper(x::TArray) = push!(TArrayKeeper, WeakRef(x))
+function copy_tarrays(task1::Task, task2::Task)
+    filter!(x -> x.value !== nothing, TArrayKeeper)
+    for wref in TArrayKeeper
+        ta = wref.value  # strong reference; GC may have cleared it since filter!
+        ta === nothing && continue
+        haskey(ta.data, task1) || continue
+        haskey(ta.data, task2) || (ta.data[task2] = ta.data[task1])
+    end
+end
+
+# _local_storage
+
+_get_local_storage(x) = x  # anything that is not a TArray is returned unchanged
+function _get_local_storage(x::TArray; copydata=false)
+    stamp, arr = x.data[current_task()]  # KeyError if this task has no entry
+    copydata || return arr  # plain reads use the stored array directly
+    latest = n_copies()
+    # A stored count behind n_copies() presumably means the task was copied
+    # since the array was stored (TODO confirm), so take a private deep copy.
+    latest > stamp || return arr
+    fresh = deepcopy(arr)
+    _set_local_storage(x, latest, fresh)
+    return fresh
+end
+
+# Record `(n_copies, d)` as the current task's entry in `x.data`; returns that tuple.
+_set_local_storage(x::TArray{T, N}, n_copies::Int, d::AbstractArray{T, N}) where {T, N} =
+    (x.data[current_task()] = (n_copies, d))
+
 localize(x) = x
 localize(x::AbstractArray) = TArray(x)
 
@@ -68,10 +115,10 @@ Array(tz)                   # convert to 4-element Array{Int64,1}: [0, 0, 0, 0]
 ```
 """
 function tzeros(T::Type, dim)
-    res = TArray{T,length(dim)}();
+    res = TArray{T,length(dim), Array{T, length(dim)}}();
     n = n_copies()
     d = zeros(T,dim)
-    task_local_storage(res.ref, (n,d))
+    _set_local_storage(res, n, d)  # store the zero-filled array for the current task
     return res
 end
 
@@ -96,37 +143,32 @@ Array(tz)                     # convert to 4-element Array{Float64,1}:  [9.0  9.
 ```
 """
 function tfill(val::Real, dim)
-    res = TArray{typeof(val),length(dim)}();
+    res = TArray{typeof(val), length(dim), Array{typeof(val), length(dim)}}();
     n = n_copies()
     d = fill(val,dim)
-    task_local_storage(res.ref, (n,d))
+    _set_local_storage(res, n, d)  # store the `val`-filled array for the current task
     return res
 end
 
 #
 # Conversion between TArray and Array
 #
-_get(x) = x
-function _get(x::TArray)
-    n, d = task_local_storage(x.ref)
-    return d
-end
 
 function Base.convert(::Type{Array}, x::TArray)
     return convert(Array{eltype(x), ndims(x)}, x)
 end
-function Base.convert(::Type{Array{T,N}}, x::TArray{T,N}) where {T,N}
-    c = convert(Array{T, N}, deepcopy(_get(x)))
+function Base.convert(::Type{Array{T, N}}, x::TArray{T, N}) where {T, N}
+    c = convert(Array{T, N}, deepcopy(_get_local_storage(x)))  # converts a deep copy of the current task's array
     return c
 end
 
 function Base.convert(::Type{TArray}, x::AbstractArray)
-    return convert(TArray{eltype(x),ndims(x)}, x)
+    return convert(TArray{eltype(x), ndims(x)}, x)
 end
-function Base.convert(::Type{TArray{T,N}}, x::AbstractArray{T,N}) where {T,N}
-    res = TArray{T,N}()
+function Base.convert(::Type{TArray{T, N}}, x::AbstractArray{T, N}) where {T, N}
+    res = TArray{T, N, typeof(x)}()  # storage type parameter is x's concrete type
     n   = n_copies()
-    task_local_storage(res.ref, (n,x))
+    _set_local_storage(res, n, x)  # x itself is stored for the current task, not a copy
     return res
 end
 
@@ -134,18 +176,18 @@ end
 # Representation
 #
 function Base.show(io::IO, ::MIME"text/plain", x::TArray)
-    arr = x.orig_task.storage[x.ref][2]
+    arr = x.data[x.orig_task][2]  # array stored under the task recorded in `orig_task`
     @warn "Here shows the originating task's storage, " *
         "not the current task's storage. " *
         "Please explicitly call show(::TArray) to display the current task's version of a TArray."
     show(io,  MIME("text/plain"), arr)
 end
 
-Base.show(io::IO, x::TArray) = Base.show(io::IO, task_local_storage(x.ref)[2])
+Base.show(io::IO, x::TArray) = show(io, _get_local_storage(x))
 
 function Base.summary(io::IO, x::TArray)
   print(io, "Task Local Array: ")
-  summary(io, _get(x))
+  summary(io, _get_local_storage(x))
 end
 
 #
@@ -154,19 +196,19 @@ end
 for F in (:size,
           :iterate,
           :firstindex, :lastindex, :axes)
-    @eval Base.$F(a::TArray, args...) = $F(_get(a), args...)
+    @eval Base.$F(a::TArray, args...) = $F(_get_local_storage(a), args...)
 end
 
 #
 # Similarity implementation
 #
 
-Base.similar(x::TArray, ::Type{T}, dims::Dims) where T = TArray(similar(_get(x), T, dims))
+Base.similar(x::TArray, ::Type{T}, dims::Dims) where T = TArray(similar(_get_local_storage(x), T, dims))
 
 for op in [:(==), :≈]
-    @eval Base.$op(x::TArray, y::AbstractArray) = Base.$op(_get(x), y)
-    @eval Base.$op(x::AbstractArray, y::TArray) = Base.$op(x, _get(y))
-    @eval Base.$op(x::TArray, y::TArray) = Base.$op(_get(x), _get(y))
+    @eval Base.$op(x::TArray, y::AbstractArray) = Base.$op(_get_local_storage(x), y)
+    @eval Base.$op(x::AbstractArray, y::TArray) = Base.$op(x, _get_local_storage(y))
+    @eval Base.$op(x::TArray, y::TArray) = Base.$op(_get_local_storage(x), _get_local_storage(y))
 end
 
 #
@@ -174,112 +216,92 @@ end
 #
 
 # Indexing Interface
-function Base.getindex(x::TArray{T, N}, I::Vararg{Int,N}) where {T, N}
-    t, d = task_local_storage(x.ref)
-    return d[I...]
+Base.@propagate_inbounds function Base.getindex(x::TArray{T, N}, I::Vararg{Int,N}) where {T, N}
+    return _get_local_storage(x)[I...]  # reads the current task's array; no copy is requested
 end
 
-function Base.setindex!(x::TArray{T, N}, e, I::Vararg{Int,N}) where {T, N}
-    n, d = task_local_storage(x.ref)
-    cn   = n_copies()
-    newd = d
-    if cn > n
-        # println("[setindex!]: $(x.ref) copying data")
-        newd = deepcopy(d)
-        task_local_storage(x.ref, (cn, newd))
-    end
-    newd[I...] = e
+Base.@propagate_inbounds function Base.setindex!(x::TArray{T, N}, e, I::Vararg{Int,N}) where {T, N}
+    # Write into the current task's array, asking for a private copy when needed.
+    _get_local_storage(x; copydata=true)[I...] = e
 end
 
-function Base.push!(x::TArray{T}, e) where T
-    n, d = task_local_storage(x.ref)
-    cn   = n_copies()
-    newd = d
-    if cn > n
-        newd = deepcopy(d)
-        task_local_storage(x.ref, (cn, newd))
-    end
-    push!(newd, e)
+function Base.push!(x::TArray, e)
+    # Append to the current task's array, asking for a private copy when needed.
+    push!(_get_local_storage(x; copydata=true), e)
 end
 
 function Base.pop!(x::TArray)
-    n, d = task_local_storage(x.ref)
-    cn   = n_copies()
-    newd = d
-    if cn > n
-        newd = deepcopy(d)
-        task_local_storage(x.ref, (cn, newd))
-    end
+    d = _get_local_storage(x; copydata=true)  # current task's array, deep-copied first when its stored count is behind n_copies()
     pop!(d)
 end
 
 # Other methods from stdlib
 
 Base.view(x::TArray, inds...; kwargs...) =
-    Base.view(_get(x), inds...; kwargs...) |> localize
-Base.:-(x::TArray) = (- _get(x)) |> localize
-Base.transpose(x::TArray) = transpose(_get(x)) |> localize
-Base.adjoint(x::TArray) = adjoint(_get(x)) |> localize
-Base.repeat(x::TArray; kw...) = repeat(_get(x); kw...) |> localize
+    Base.view(_get_local_storage(x), inds...; kwargs...) |> localize
+Base.:-(x::TArray) = (- _get_local_storage(x)) |> localize
+Base.transpose(x::TArray) = transpose(_get_local_storage(x)) |> localize
+Base.adjoint(x::TArray) = adjoint(_get_local_storage(x)) |> localize
+Base.repeat(x::TArray; kw...) = repeat(_get_local_storage(x); kw...) |> localize
 
 Base.hcat(xs::Union{TArray{T,1}, TArray{T,2}}...) where T =
-    hcat(_get.(xs)...) |> localize
+    hcat(_get_local_storage.(xs)...) |> localize
 Base.vcat(xs::Union{TArray{T,1}, TArray{T,2}}...) where T =
-    vcat(_get.(xs)...) |> localize
+    vcat(_get_local_storage.(xs)...) |> localize
 Base.cat(xs::Union{TArray{T,1}, TArray{T,2}}...; dims) where T =
-    cat(_get.(xs)...; dims = dims) |> localize
+    cat(_get_local_storage.(xs)...; dims = dims) |> localize
 
 
-Base.reshape(x::TArray, dims::Union{Colon,Int}...) = reshape(_get(x), dims) |> localize
+Base.reshape(x::TArray, dims::Union{Colon,Int}...) = reshape(_get_local_storage(x), dims) |> localize
 Base.reshape(x::TArray, dims::Tuple{Vararg{Union{Int,Colon}}}) =
-    reshape(_get(x), Base._reshape_uncolon(_get(x), dims)) |> localize
-Base.reshape(x::TArray, dims::Tuple{Vararg{Int}}) = reshape(_get(x), dims) |> localize
-
-Base.permutedims(x::TArray, perm) = permutedims(_get(x), perm) |> localize
-Base.PermutedDimsArray(x::TArray, perm) = PermutedDimsArray(_get(x), perm) |> localize
-Base.reverse(x::TArray; dims) = reverse(_get(x), dims = dims) |> localize
-
-Base.sum(x::TArray; dims = :) = sum(_get(x), dims = dims) |> localize
-Base.sum(f::Union{Function,Type},x::TArray) = sum(f.(_get(x))) |> localize
-Base.prod(x::TArray; dims=:) = prod(_get(x); dims=dims) |> localize
-Base.prod(f::Union{Function, Type}, x::TArray) = prod(f.(_get(x))) |> localize
-
-Base.findfirst(x::TArray, args...) = findfirst(_get(x), args...) |> localize
-Base.maximum(x::TArray; dims = :) = maximum(_get(x), dims = dims) |> localize
-Base.minimum(x::TArray; dims = :) = minimum(_get(x), dims = dims) |> localize
-
-Base.:/(x::TArray, y::TArray) = _get(x) / _get(y) |> localize
-Base.:/(x::AbstractArray, y::TArray) = x / _get(y) |> localize
-Base.:/(x::TArray, y::AbstractArray) = _get(x) / y |> localize
-Base.:\(x::TArray, y::TArray) = _get(x) \ _get(y) |> localize
-Base.:\(x::AbstractArray, y::TArray) = x \ _get(y) |> localize
-Base.:\(x::TArray, y::AbstractArray) = _get(x) \ y |> localize
-Base.:*(x::TArray, y::TArray) = _get(x) * _get(y) |> localize
-Base.:*(x::AbstractArray, y::TArray) = x * _get(y) |> localize
-Base.:*(x::TArray, y::AbstractArray) = _get(x) * y |> localize
+    reshape(_get_local_storage(x), Base._reshape_uncolon(_get_local_storage(x), dims)) |> localize
+Base.reshape(x::TArray, dims::Tuple{Vararg{Int}}) = reshape(_get_local_storage(x), dims) |> localize
+
+Base.permutedims(x::TArray, perm) = permutedims(_get_local_storage(x), perm) |> localize
+Base.PermutedDimsArray(x::TArray, perm) = PermutedDimsArray(_get_local_storage(x), perm) |> localize
+Base.reverse(x::TArray; dims) = reverse(_get_local_storage(x), dims = dims) |> localize
+
+Base.sum(x::TArray; dims = :) = sum(_get_local_storage(x), dims = dims) |> localize
+Base.sum(f::Union{Function,Type},x::TArray) = sum(f.(_get_local_storage(x))) |> localize
+Base.prod(x::TArray; dims=:) = prod(_get_local_storage(x); dims=dims) |> localize
+Base.prod(f::Union{Function, Type}, x::TArray) = prod(f.(_get_local_storage(x))) |> localize
+
+Base.findfirst(x::TArray, args...) = findfirst(_get_local_storage(x), args...) |> localize
+Base.maximum(x::TArray; dims = :) = maximum(_get_local_storage(x), dims = dims) |> localize
+Base.minimum(x::TArray; dims = :) = minimum(_get_local_storage(x), dims = dims) |> localize
+
+Base.:/(x::TArray, y::TArray) = _get_local_storage(x) / _get_local_storage(y) |> localize
+Base.:/(x::AbstractArray, y::TArray) = x / _get_local_storage(y) |> localize
+Base.:/(x::TArray, y::AbstractArray) = _get_local_storage(x) / y |> localize
+Base.:\(x::TArray, y::TArray) = _get_local_storage(x) \ _get_local_storage(y) |> localize
+Base.:\(x::AbstractArray, y::TArray) = x \ _get_local_storage(y) |> localize
+Base.:\(x::TArray, y::AbstractArray) = _get_local_storage(x) \ y |> localize
+Base.:*(x::TArray, y::TArray) = _get_local_storage(x) * _get_local_storage(y) |> localize
+Base.:*(x::AbstractArray, y::TArray) = x * _get_local_storage(y) |> localize
+Base.:*(x::TArray, y::AbstractArray) = _get_local_storage(x) * y |> localize
 
 # broadcast
-Base.BroadcastStyle(::Type{TArray{T, N}}) where {T, N} = Broadcast.ArrayStyle{TArray}()
-Broadcast.broadcasted(::Broadcast.ArrayStyle{TArray}, f, args...) = f.(_get.(args)...) |> localize
+Base.BroadcastStyle(::Type{<:TArray}) = Broadcast.ArrayStyle{TArray}()
+Broadcast.broadcasted(::Broadcast.ArrayStyle{TArray}, f, args...) = f.(_get_local_storage.(args)...) |> localize
 
 import LinearAlgebra
 import LinearAlgebra:  \, /, inv, det, logdet, logabsdet, norm
 
-LinearAlgebra.inv(x::TArray) = inv(_get(x)) |> localize
-LinearAlgebra.det(x::TArray) = det(_get(x)) |> localize
-LinearAlgebra.logdet(x::TArray) = logdet(_get(x)) |> localize
-LinearAlgebra.logabsdet(x::TArray) = logabsdet(_get(x)) |> localize
+LinearAlgebra.inv(x::TArray) = inv(_get_local_storage(x)) |> localize
+LinearAlgebra.det(x::TArray) = det(_get_local_storage(x)) |> localize
+LinearAlgebra.logdet(x::TArray) = logdet(_get_local_storage(x)) |> localize
+LinearAlgebra.logabsdet(x::TArray) = logabsdet(_get_local_storage(x)) |> localize
 LinearAlgebra.norm(x::TArray, p::Real = 2) =
-    LinearAlgebra.norm(_get(x), p) |> localize
+    LinearAlgebra.norm(_get_local_storage(x), p) |> localize
 
 import LinearAlgebra: dot
-dot(x::TArray, ys::TArray) = dot(_get(x), _get(ys)) |> localize
-dot(x::AbstractArray, ys::TArray) = dot(x, _get(ys)) |> localize
-dot(x::TArray, ys::AbstractArray) = dot(_get(x), ys) |> localize
+dot(x::TArray, ys::TArray) = dot(_get_local_storage(x), _get_local_storage(ys)) |> localize
+dot(x::AbstractArray, ys::TArray) = dot(x, _get_local_storage(ys)) |> localize
+dot(x::TArray, ys::AbstractArray) = dot(_get_local_storage(x), ys) |> localize
 
 using Statistics
-Statistics.mean(x::TArray; dims = :) = mean(_get(x), dims = dims) |> localize
-Statistics.std(x::TArray; kw...) = std(_get(x), kw...) |> localize
+Statistics.mean(x::TArray; dims = :) = mean(_get_local_storage(x), dims = dims) |> localize
+Statistics.std(x::TArray; kw...) = std(_get_local_storage(x); kw...) |> localize  # keywords are forwarded as keywords
 
 # TODO
 # * NNlib
diff --git a/test/benchmarks.jl b/test/benchmarks.jl
index ea9fd025..25327808 100644
--- a/test/benchmarks.jl
+++ b/test/benchmarks.jl
@@ -1,21 +1,36 @@
 using BenchmarkTools
 using Libtask
 
+# `@rep cnt exp`: expand to a block of `cnt` copies of `exp` (benchmark helper).
+macro rep(cnt, exp)
+    blk = Expr(:block)
+    for _ in 1:Core.eval(__module__, cnt)  # resolve `cnt` in the caller's module
+        push!(blk.args, esc(exp))
+    end
+    return blk
+end
+
+INTENSITY = 6
+
+indexing(a, x, y) = @rep INTENSITY a[x, y]
+setindexing(a, x, y) = @rep INTENSITY a[x, y] = 1
+broadcasting(a) = @rep INTENSITY a .+ a
+
 println("= Benchmarks on Arrays =")
-A = rand(100, 100)
-x, y =  abs.(rand(Int, 2) .% 100)
+A = rand(1000, 1000)
+x, y =  abs.(rand(Int, 2) .% 999) .+ 1
 print("indexing: ")
-@btime $A[$x, $y] + $A[$x, $y]
+@btime indexing($A, $x, $y)
 print("set indexing: ")
-@btime $A[$x, $y] = 1
+@btime setindexing($A, $x, $y)
 print("broadcast: ")
-@btime $A .+ $A
+@btime broadcasting($A)
 
 println("= Benchmarks on TArrays =")
 TA = Libtask.localize(deepcopy(A))
 print("indexing: ")
-@btime $TA[$x, $y] + $TA[$x, $y]
+@btime indexing($TA, $x, $y)
 print("set indexing: ")
-@btime $TA[$x, $y] = 1
+@btime setindexing($TA, $x, $y)
 print("broadcast: ")
-@btime $TA .+ $TA
+@btime broadcasting($TA)
diff --git a/test/tarray.jl b/test/tarray.jl
index 0ce037a6..9c20bd90 100644
--- a/test/tarray.jl
+++ b/test/tarray.jl
@@ -55,7 +55,7 @@
         @test_throws MethodError TArray{Int,2}(undef, 4)
 
         ta3 = TArray{Int, 4}(4, 3, 2, 1)
-        ta4 = Libtask._get(ta3)
+        ta4 = Libtask._get_local_storage(ta3)
         @test ta3[3] == ta4[3]
 
         ta5 = TArray{Int}(4)
@@ -116,7 +116,7 @@
 
         @test repeat(ta, 1, 2) == hcat(ta, ta)
 
-        @test ta .+ ta == Libtask._get(ta) .+ Libtask._get(ta)
+        @test ta .+ ta == Libtask._get_local_storage(ta) .+ Libtask._get_local_storage(ta)
     end
 
     @testset "task copy" begin
diff --git a/deps/methods_of_array.jl b/utils/methods_of_array.jl
similarity index 100%
rename from deps/methods_of_array.jl
rename to utils/methods_of_array.jl