Skip to content
This repository was archived by the owner on May 4, 2019. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 0 additions & 1 deletion src/DataArrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,6 @@ module DataArrays
padNA,
pdata,
percent_change,
Perm,
PooledDataArray,
PooledDataMatrix,
PooledDataVecs,
Expand Down
17 changes: 12 additions & 5 deletions src/grouping.jl
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
function groupsort_indexer(x::AbstractVector, ngroups::Integer)
function groupsort_indexer(x::AbstractVector, ngroups::Integer, nalast::Bool=false)
# translated from Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx).

# count group sizes, location 0 for NA
Expand All @@ -11,10 +11,17 @@ function groupsort_indexer(x::AbstractVector, ngroups::Integer)

# mark the start of each contiguous group of like-indexed data
where = fill(1, ngroups + 1)
for i = 2:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
if nalast
for i = 3:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
where[1] = where[end] + counts[end]
else
for i = 2:ngroups+1
where[i] = where[i - 1] + counts[i - 1]
end
end

# this is our indexer
result = fill(0, n)
for i = 1:n
Expand All @@ -25,4 +32,4 @@ function groupsort_indexer(x::AbstractVector, ngroups::Integer)
result, where, counts
end

# Convenience methods for pooled vectors: run the group-sort indexer on the vector's `refs`,
# using the length of its pool as the number of groups.
# NOTE(review): these two lines look like the removed/added sides of the same diff hunk. The
# second form (optional `nalast`, default `false`) also accepts the one-argument call handled
# by the first, so only one of them is expected to exist in the real file — confirm.
groupsort_indexer(pv::PooledDataVector) = groupsort_indexer(pv.refs, length(pv.pool))
groupsort_indexer(pv::PooledDataVector, nalast::Bool=false) = groupsort_indexer(pv.refs, length(pv.pool), nalast)
25 changes: 15 additions & 10 deletions src/pooleddataarray.jl
Original file line number Diff line number Diff line change
Expand Up @@ -773,19 +773,24 @@ end
##
##############################################################################

# Sort permutation of a pooled array: the first element of the tuple returned by
# `groupsort_indexer`.
# TODO handle sortperm for non-sorted keys
# NOTE(review): this one-liner appears to be a removed diff line superseded by the
# keyword-accepting `function Base.sortperm(pda::PooledDataArray; ...)` — confirm.
Base.sortperm(pda::PooledDataArray) = groupsort_indexer(pda)[1]
function Base.sortperm(pda::PooledDataArray)
if issorted(pda.pool)
return groupsort_indexer(pda)[1]
else
return sortperm(reorder!(copy(pda)))
# Return a permutation vector `p` such that `pda[p]` is in sorted order.
#
# Keyword arguments (`alg`, `lt`, `by`, `rev`, `order`) are combined into a single ordering
# with `Base.ord`.
#
# * Plain forward or plain reverse ordering: the permutation comes from `groupsort_indexer`
#   (called with `nalast = true`) when the pool is already sorted, otherwise from a
#   `reorder`ed copy. For the reverse ordering the forward permutation is reversed in place.
# * Any other ordering (custom `lt` / `by`, with or without `rev`): generic `sort!` of the
#   index vector under `Base.Order.Perm(order, pda)`; `alg` is only used on this path.
#
# NOTE(review): for an all-NA array with an empty pool (`ngroups == 0`), the `nalast` branch
# of `groupsort_indexer` computes `where[1] = where[end] + counts[end]` — confirm that case
# is handled there.
function Base.sortperm(pda::PooledDataArray; alg::Base.Sort.Algorithm=Base.Sort.DEFAULT_UNSTABLE,
                       lt::Function=isless, by::Function=identity,
                       rev::Bool=false, order=Base.Sort.Forward)
    order = Base.ord(lt, by, rev, order)

    isforward = isa(order, Base.Order.ForwardOrdering)
    # Only the reverse of the *plain* forward ordering may take the fast path. `rev=true`
    # together with a custom `lt`/`by` also produces a `ReverseOrdering`, but one that wraps
    # the custom ordering; matching any `ReverseOrdering` would silently drop `lt`/`by`.
    isreverse = isa(order, Base.Order.ReverseOrdering{Base.Order.ForwardOrdering})

    # TODO handle custom ordering efficiently
    if !isforward && !isreverse
        # `collect` builds the index vector explicitly; `[1:n]` relies on range
        # auto-concatenation, which is not available on all Julia versions.
        return sort!(collect(1:length(pda)), alg, Base.Order.Perm(order, pda))
    end

    # TODO handle non-sorted keys without copying
    perm = issorted(pda.pool) ? groupsort_indexer(pda, true)[1] : sortperm(reorder(pda))
    isreverse && reverse!(perm)
    perm
end

# Permutation for a reverse ordering passed positionally: the forward permutation, reversed.
Base.sortperm(pda::PooledDataArray, ::Base.Sort.ReverseOrdering) = reverse(sortperm(pda))
# Sort by indexing `pda` with its sort permutation.
Base.sort(pda::PooledDataArray) = pda[sortperm(pda)]
# Reverse sort: index `pda` with the reversed forward permutation.
Base.sort(pda::PooledDataArray, ::Base.Sort.ReverseOrdering) = pda[reverse(sortperm(pda))]
# Keyword form: forwards every keyword argument to `sortperm` and indexes `pda` with the result.
# NOTE(review): keyword arguments do not take part in dispatch, so this method has the same
# positional signature as the zero-keyword `Base.sort(pda::PooledDataArray)` above. The lines
# above look like the removed side of the diff and this one like its replacement — confirm.
Base.sort(pda::PooledDataArray; kw...) = pda[sortperm(pda; kw...)]

type FastPerm{O<:Base.Sort.Ordering,V<:AbstractVector} <: Base.Sort.Ordering
ord::O
vec::V
Expand Down
24 changes: 18 additions & 6 deletions test/sort.jl
Original file line number Diff line number Diff line change
@@ -1,17 +1,29 @@
# Tests for `sort` / `sortperm` on DataArrays and PooledDataArrays, including where NAs end up.
# NOTE(review): this listing appears to interleave removed and added diff lines (for example,
# four trailing `end`s for three openers) — check against the real file before running it.
module TestSort
using DataArrays, Base.Test

# Small integer fixture with a repeated value (3) and two NAs.
dv1 = @data([9, 1, 8, NA, 3, 3, 7, NA])
# Same data scaled by 1.0.
dv2 = 1.0 * dv1
# dv3 is not referenced by any test visible in this listing.
dv3 = DataArray([1:8])
# Pooled version of dv1.
pdv1 = convert(PooledDataArray, dv1)

@test sortperm(dv1) == sortperm(dv2)
@test sortperm(dv1) == sortperm(pdv1)
# NOTE(review): the next assertion compares sort(dv1) with a conversion of itself, so it says
# nothing about pooled arrays; the assertion after it is the pooled comparison.
@test isequal(sort(dv1), convert(DataArray, sort(dv1)))
@test isequal(sort(dv1), convert(DataArray, sort(pdv1)))

for T in (Float64, BigFloat)
# Build n values of type T with a random NA mask; nna is the number of NAs.
n = 1000
na = randbool(n)
nna = sum(na)
a = Array(T, n)
ra = randn(n-nna)
a[!na] = ra
da = DataArray(a, na)
# Expected layout: sorted non-NA values followed by nna trailing entries for a forward sort,
# and nna leading entries followed by the reverse-sorted values for rev=true.
# (presumably DataArray(T, nna) is an all-NA array of length nna — TODO confirm)
@test isequal(sort(da), [DataArray(sort(dropna(da))), DataArray(T, nna)])
@test isequal(da[sortperm(da)], [DataArray(sort(dropna(da))), DataArray(T, nna)])
@test isequal(sort(da, rev=true), [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
@test isequal(da[sortperm(da, rev=true)], [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
# Same checks over three containers: a DataArray, a PooledDataArray, and a PooledDataArray
# whose levels were replaced by a shuffled pool via setlevels!.
for da in (DataArray(a, na), PooledDataArray(a, na), (pda = PooledDataArray(a, na); setlevels!(pda, shuffle!(pda.pool))))
@test isequal(sort(da), [DataArray(sort(dropna(da))), DataArray(T, nna)])
# Explicit `lt` closure that just calls isless: same expected result as the default sort.
@test isequal(sort(da; lt=(x,y)->isless(x,y)), [DataArray(sort(dropna(da))), DataArray(T, nna)])
@test isequal(da[sortperm(da)], [DataArray(sort(dropna(da))), DataArray(T, nna)])
@test isequal(sort(da, rev=true), [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
@test isequal(da[sortperm(da, rev=true)], [DataArray(T, nna), DataArray(sort(dropna(da), rev=true))])
end
end
end
end