From cffe69ffb885356df61687436d313f6ec25e612d Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Mon, 12 Sep 2016 13:17:10 +0200 Subject: [PATCH 1/8] vcat of PooledDataVector might need to expand the reftype --- src/pooleddataarray.jl | 42 ++++++++++++++++++++++++++++++++--------- test/pooleddataarray.jl | 10 ++++++++++ 2 files changed, 43 insertions(+), 9 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 9576950..fc21d48 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -143,6 +143,9 @@ for (f, basef) in ((:pdatazeros, :zeros), (:pdataones, :ones)) end end +# Pooled reference type +reftype{T,R}(pa::PooledDataArray{T,R}) = R + ############################################################################## ## ## Basic size properties of all Data* objects @@ -251,13 +254,15 @@ end ## ############################################################################## -function compact{T,R<:Integer,N}(d::PooledDataArray{T,R,N}) - sz = length(d.pool) - +function compactreftype(sz) REFTYPE = sz <= typemax(UInt8) ? UInt8 : sz <= typemax(UInt16) ? UInt16 : sz <= typemax(UInt32) ? UInt32 : UInt64 +end + +function compact{T,R<:Integer,N}(d::PooledDataArray{T,R,N}) + REFTYPE = compactreftype(length(d.pool)) if REFTYPE == R return d @@ -618,12 +623,7 @@ Perm{O<:Base.Sort.Ordering}(o::O, v::PooledDataVector) = FastPerm(o, v) function PooledDataVecs{S,Q<:Integer,R<:Integer,N}(v1::PooledDataArray{S,Q,N}, v2::PooledDataArray{S,R,N}) pool = sort(unique([v1.pool; v2.pool])) - sz = length(pool) - - REFTYPE = sz <= typemax(UInt8) ? UInt8 : - sz <= typemax(UInt16) ? UInt16 : - sz <= typemax(UInt32) ? UInt32 : - UInt64 + REFTYPE = compactreftype(length(pool)) tidx1 = convert(Vector{REFTYPE}, findat(pool, v1.pool)) tidx2 = convert(Vector{REFTYPE}, findat(pool, v2.pool)) @@ -829,3 +829,27 @@ function dropna{T}(pdv::PooledDataVector{T}) resize!(res, total) return res end + +function Base.vcat(pa::PooledDataArray...) + N = length(size(pa[1])) + for p in pa + @assert length(size(p))==N + end + + pools = [p.pool for p in pa] + pool = levels([pools...;]) + + # grow the reftype as much as needed + # unless one of the reftypes in 'pa' was big enough + ref_sz = [typemax(reftype(p)) for p in pa] + sz = maximum([length(pool), ref_sz...]) + REFTYPE = compactreftype(sz) + + idx = map(pa) do p + m = findat(pool, p.pool) + m[p.refs] + end + + refs = Array{REFTYPE,N}([idx...;]) + PooledDataArray(RefArray(refs), pool) +end diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index cedab66..dc57d91 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -109,4 +109,14 @@ module TestPDA pda = @pdata([NA, "A", "B", "C", "A", "B"]) @test isequal(Base.permute!!(copy(pda), [2, 5, 3, 6, 4, 1]), @pdata(["A", "A", "B", "B", "C", NA])) @test isequal(Base.ipermute!!(copy(pda), [6, 1, 3, 5, 2, 4]), @pdata(["A", "A", "B", "B", "C", NA])) + + a1 = 1:200 + a2 = 100:300 + pa1 = PooledDataArray(a1); + pa2 = PooledDataArray(a2); + ca1 = compact(pa1); + ca2 = compact(pa2); + @test vcat(ca1, ca2) == vcat(a1, a2) + @test vcat(ca1, ca2) |> DataArrays.reftype == UInt16 + @test vcat(ca1, pa2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE end From 6a7fa47fafb119d511dd3e906bc014cf5479b680 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Mon, 12 Sep 2016 21:58:53 +0200 Subject: [PATCH 2/8] type-stable vcat of pooled arrays --- src/pooleddataarray.jl | 25 +++++++++++-------------- test/pooleddataarray.jl | 2 +- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index fc21d48..fd2ba35 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -830,26 +830,23 @@ function dropna{T}(pdv::PooledDataVector{T}) return res end -function Base.vcat(pa::PooledDataArray...) - N = length(size(pa[1])) - for p in pa - @assert length(size(p))==N +function Base.vcat{T,R,N}(p1::PooledDataArray{T,R,N}, p2::PooledDataArray...) + dim{T2,R2,N2}(p::PooledDataArray{T2,R2,N2}) = N + + for p in p2 + @assert dim(p)==N end - pools = [p.pool for p in pa] - pool = levels([pools...;]) + pa = PooledDataArray[p1, p2...] - # grow the reftype as much as needed - # unless one of the reftypes in 'pa' was big enough - ref_sz = [typemax(reftype(p)) for p in pa] - sz = maximum([length(pool), ref_sz...]) - REFTYPE = compactreftype(sz) + pools = Vector{T}[p.pool for p in pa] + pool = levels(T[pools...;]) - idx = map(pa) do p + idx = [ begin m = findat(pool, p.pool) m[p.refs] - end + end for p in pa] - refs = Array{REFTYPE,N}([idx...;]) + refs = Array{DEFAULT_POOLED_REF_TYPE,N}([idx...;]) PooledDataArray(RefArray(refs), pool) end diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index dc57d91..ec6290e 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -117,6 +117,6 @@ module TestPDA ca1 = compact(pa1); ca2 = compact(pa2); @test vcat(ca1, ca2) == vcat(a1, a2) - @test vcat(ca1, ca2) |> DataArrays.reftype == UInt16 + @test vcat(ca1, ca2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE @test vcat(ca1, pa2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE end From 41f1040a7f51ae090a1db567f650e3aee5dd8b8e Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 15 Sep 2016 23:36:15 +0200 Subject: [PATCH 3/8] vcat: test multi-dimensional arrays --- src/pooleddataarray.jl | 1 + test/pooleddataarray.jl | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index fd2ba35..634bbc3 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -835,6 +835,7 @@ function Base.vcat{T,R,N}(p1::PooledDataArray{T,R,N}, p2::PooledDataArray...) for p in p2 @assert dim(p)==N + @assert size(p)[2:end] == size(p1)[2:end] end pa = PooledDataArray[p1, p2...] diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index ec6290e..0d6b642 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -119,4 +119,12 @@ module TestPDA @test vcat(ca1, ca2) == vcat(a1, a2) @test vcat(ca1, ca2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE @test vcat(ca1, pa2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE + + a1 = zeros(2,3,4,5) + a2 = zeros(3,3,4,5) + a1[1:end] = 1:length(a1) + a2[1:end] = (1:length(a2)) + length(a1) + ca1 = PooledDataArray(a1) |> compact; + ca2 = PooledDataArray(a2) |> compact; + @test vcat(ca1, ca2) == vcat(a1, a2) end From 76f749b18f12df827052a95a70ad8b06b413e371 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Thu, 15 Sep 2016 23:36:33 +0200 Subject: [PATCH 4/8] vcat: replace dim with ndims --- src/pooleddataarray.jl | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 634bbc3..11e418d 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -831,10 +831,8 @@ function dropna{T}(pdv::PooledDataVector{T}) end function Base.vcat{T,R,N}(p1::PooledDataArray{T,R,N}, p2::PooledDataArray...) - dim{T2,R2,N2}(p::PooledDataArray{T2,R2,N2}) = N - for p in p2 - @assert dim(p)==N + @assert ndims(p) == N @assert size(p)[2:end] == size(p1)[2:end] end From cd07b873c6a019de901d17142de6e3d27f7acb06 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Fri, 16 Sep 2016 14:34:40 +0200 Subject: [PATCH 5/8] improved tests --- test/pooleddataarray.jl | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index 0d6b642..442bf1e 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -112,19 +112,22 @@ module TestPDA a1 = 1:200 a2 = 100:300 - pa1 = PooledDataArray(a1); - pa2 = PooledDataArray(a2); - ca1 = compact(pa1); - ca2 = compact(pa2); - @test vcat(ca1, ca2) == vcat(a1, a2) - @test vcat(ca1, ca2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE - @test vcat(ca1, pa2) |> DataArrays.reftype == DataArrays.DEFAULT_POOLED_REF_TYPE - - a1 = zeros(2,3,4,5) - a2 = zeros(3,3,4,5) - a1[1:end] = 1:length(a1) - a2[1:end] = (1:length(a2)) + length(a1) - ca1 = PooledDataArray(a1) |> compact; - ca2 = PooledDataArray(a2) |> compact; - @test vcat(ca1, ca2) == vcat(a1, a2) + pa1 = PooledDataArray(a1) + pa2 = PooledDataArray(a2) + ca1 = compact(pa1) + ca2 = compact(pa2) + r = vcat(ca1, ca2) + @test r == vcat(a1, a2) + @test isa(r, PooledDataArray{Int,DataArrays.DEFAULT_POOLED_REF_TYPE}) + @test isa(vcat(ca1, pa2), PooledDataArray{Int,DataArrays.DEFAULT_POOLED_REF_TYPE}) + + a1 = Array{Int64}(2,3,4,5) + a2 = Array{Int64}(3,3,4,5) + a1[1:end] = length(a1):-1:1 + a2[1:end] = (1:length(a2)) + 10 + ca1 = compact(PooledDataArray(a1)) + ca2 = compact(PooledDataArray(a2)) + r = vcat(ca1, ca2) + @test r == vcat(a1, a2) + @test isa(r, PooledDataArray{Int,DataArrays.DEFAULT_POOLED_REF_TYPE,4}) end From ece7e37bc05d5758ee15b727bb156b7823b2309c Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Fri, 16 Sep 2016 14:34:50 +0200 Subject: [PATCH 6/8] using simpler functions --- src/pooleddataarray.jl | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 11e418d..9a07ade 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -143,9 +143,6 @@ for (f, basef) in ((:pdatazeros, :zeros), (:pdataones, :ones)) end end -# Pooled reference type -reftype{T,R}(pa::PooledDataArray{T,R}) = R - ############################################################################## ## ## Basic size properties of all Data* objects @@ -831,18 +828,11 @@ function dropna{T}(pdv::PooledDataVector{T}) end function Base.vcat{T,R,N}(p1::PooledDataArray{T,R,N}, p2::PooledDataArray...) - for p in p2 - @assert ndims(p) == N - @assert size(p)[2:end] == size(p1)[2:end] - end - - pa = PooledDataArray[p1, p2...] - - pools = Vector{T}[p.pool for p in pa] - pool = levels(T[pools...;]) + pa = (p1, p2...) + pool = unique(T[[p.pool for p in pa]...;]) - idx = [ begin - m = findat(pool, p.pool) + idx = [begin + m = indexin(p.pool, pool) m[p.refs] end for p in pa] From 5f43d988935c6ecdfa43c760c336183b0c064b07 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Sat, 17 Sep 2016 11:32:18 +0200 Subject: [PATCH 7/8] Use platform-default Int instead of Int64 --- test/pooleddataarray.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/pooleddataarray.jl b/test/pooleddataarray.jl index 442bf1e..ce99b49 100644 --- a/test/pooleddataarray.jl +++ b/test/pooleddataarray.jl @@ -121,8 +121,8 @@ module TestPDA @test isa(r, PooledDataArray{Int,DataArrays.DEFAULT_POOLED_REF_TYPE}) @test isa(vcat(ca1, pa2), PooledDataArray{Int,DataArrays.DEFAULT_POOLED_REF_TYPE}) - a1 = Array{Int64}(2,3,4,5) - a2 = Array{Int64}(3,3,4,5) + a1 = Array{Int}(2,3,4,5) + a2 = Array{Int}(3,3,4,5) a1[1:end] = length(a1):-1:1 a2[1:end] = (1:length(a2)) + 10 ca1 = compact(PooledDataArray(a1)) From 39fef7ec282674a4e009613f755a7bafa00d69e1 Mon Sep 17 00:00:00 2001 From: Johan Gustafsson Date: Sat, 17 Sep 2016 11:33:31 +0200 Subject: [PATCH 8/8] compact code --- src/pooleddataarray.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/pooleddataarray.jl b/src/pooleddataarray.jl index 9a07ade..a646862 100644 --- a/src/pooleddataarray.jl +++ b/src/pooleddataarray.jl @@ -831,10 +831,7 @@ function Base.vcat{T,R,N}(p1::PooledDataArray{T,R,N}, p2::PooledDataArray...) pa = (p1, p2...) pool = unique(T[[p.pool for p in pa]...;]) - idx = [begin - m = indexin(p.pool, pool) - m[p.refs] - end for p in pa] + idx = [indexin(p.pool, pool)[p.refs] for p in pa] refs = Array{DEFAULT_POOLED_REF_TYPE,N}([idx...;]) PooledDataArray(RefArray(refs), pool)