Skip to content

Allow skipping of sites and mutations in sort #1826

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Oct 22, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -6297,10 +6297,11 @@ test_sort_tables_offsets(void)
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, &copy, 0));

/* Individual bookmark ignored */
tsk_memset(&bookmark, 0, sizeof(bookmark));
bookmark.individuals = tables.individuals.num_rows - 1;
ret = tsk_table_collection_sort(&tables, &bookmark, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED);
CU_ASSERT_EQUAL_FATAL(ret, 0);

tsk_table_collection_free(&tables);
tsk_table_collection_free(&copy);
Expand Down Expand Up @@ -6409,8 +6410,8 @@ test_sort_tables_errors(void)
tsk_table_collection_t tables;
tsk_bookmark_t pos;

tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL,
NULL, NULL, NULL, 0);
tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL,
single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0);
ret = tsk_treeseq_copy_tables(&ts, &tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

Expand Down Expand Up @@ -6457,8 +6458,7 @@ test_sort_tables_errors(void)
ret = tsk_table_collection_sort(&tables, &pos, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

/* Setting sites or mutations gives a BAD_PARAM. See
* github.com/tskit-dev/tskit/issues/101 */
/* Specifying only one of sites or mutations is an error */
tsk_memset(&pos, 0, sizeof(pos));
pos.sites = 1;
ret = tsk_table_collection_sort(&tables, &pos, 0);
Expand Down
8 changes: 0 additions & 8 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -6348,14 +6348,6 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start)
ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED;
goto out;
}

/* Individuals also must be all sorted or skipped entirely */
if (start->individuals == self->tables->individuals.num_rows) {
skip_individuals = true;
} else if (start->individuals != 0) {
ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED;
goto out;
}
}
/* The indexes will be invalidated, so drop them */
ret = tsk_table_collection_drop_index(self->tables, 0);
Expand Down
5 changes: 4 additions & 1 deletion python/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,11 @@

**Features**

- Allow skipping of site and mutation tables in ``TableCollection.sort``
(:user:`benjeffery`, :issue:`1475`, :pr:`1826`).

- Add ``TableCollection.sort_individuals`` to sort the individuals as this is no longer done by the
default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`)
default sort (:user:`benjeffery`, :issue:`1774`, :pr:`1789`).

- Add ``__setitem__`` to all tables allowing single rows to be updated. For example
``tables.nodes[0] = tables.nodes[0].replace(flags=tskit.NODE_IS_SAMPLE)``
Expand Down
11 changes: 8 additions & 3 deletions python/_tskitmodule.c
Original file line number Diff line number Diff line change
Expand Up @@ -6611,17 +6611,22 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds)
int err;
PyObject *ret = NULL;
Py_ssize_t edge_start = 0;
Py_ssize_t site_start = 0;
Py_ssize_t mutation_start = 0;
tsk_bookmark_t start;
static char *kwlist[] = { "edge_start", NULL };
static char *kwlist[] = { "edge_start", "site_start", "mutation_start", NULL };

if (TableCollection_check_state(self) != 0) {
goto out;
}
if (!PyArg_ParseTupleAndKeywords(args, kwds, "|n", kwlist, &edge_start)) {
if (!PyArg_ParseTupleAndKeywords(
args, kwds, "|nnn", kwlist, &edge_start, &site_start, &mutation_start)) {
goto out;
}
memset(&start, 0, sizeof(start));
start.edges = edge_start;
start.edges = (tsk_size_t) edge_start;
start.sites = (tsk_size_t) site_start;
start.mutations = (tsk_size_t) mutation_start;
err = tsk_table_collection_sort(self->tables, &start, 0);
if (err != 0) {
handle_library_error(err);
Expand Down
33 changes: 24 additions & 9 deletions python/tests/test_tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -2216,7 +2216,7 @@ def verify_sort(self, tables, seed):
self.verify_sort_mutation_consistency(tables, seed)
self.verify_randomise_tables(tables, seed)

def verify_edge_sort_offset(self, ts):
def verify_sort_offset(self, ts):
"""
Verifies the behaviour of the edge_start offset value.
"""
Expand All @@ -2225,7 +2225,6 @@ def verify_edge_sort_offset(self, ts):
starts = [0]
if len(edges) > 2:
starts = [0, 1, len(edges) // 2, len(edges) - 2]
random.seed(self.random_seed)
for start in starts:
# Unsort the edges starting from index start
all_edges = list(ts.edges())
Expand All @@ -2235,7 +2234,7 @@ def verify_edge_sort_offset(self, ts):
tables.edges.clear()
for e in all_edges:
tables.edges.append(e)
# Verify that import fails for randomised edges
# Verify that import fails for reversed edges
with pytest.raises(_tskit.LibraryError):
tables.tree_sequence()
# If we sort after the start value we should still fail.
Expand All @@ -2250,6 +2249,22 @@ def verify_edge_sort_offset(self, ts):
# Verify the new and old edges are equal.
assert edges == tables.edges

tables.tree_sequence()
if len(tables.mutations) > 2:
mutations = tables.mutations.copy()
tables.mutations.clear()
for m in mutations[::-1]:
tables.mutations.append(m)
with pytest.raises(_tskit.LibraryError):
tables.tree_sequence()
tables.sort(
0, site_start=len(tables.sites), mutation_start=len(tables.mutations)
)
with pytest.raises(_tskit.LibraryError):
tables.tree_sequence()
tables.sort(0)
tables.tree_sequence()

def get_wf_example(self, seed):
tables = wf.wf_sim(
6,
Expand Down Expand Up @@ -2277,7 +2292,7 @@ def test_wf_example(self):

def test_single_tree_no_mutations(self):
ts = msprime.simulate(10, random_seed=self.random_seed)
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 432)

def test_single_tree_no_mutations_metadata(self):
Expand All @@ -2288,13 +2303,13 @@ def test_single_tree_no_mutations_metadata(self):
def test_many_trees_no_mutations(self):
ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed)
assert ts.num_trees > 2
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 31)

def test_single_tree_mutations(self):
ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed)
assert ts.num_sites > 2
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 83)

def test_single_tree_mutations_metadata(self):
Expand All @@ -2320,7 +2335,7 @@ def test_many_trees_mutations(self):
)
assert ts.num_trees > 2
assert ts.num_sites > 2
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 173)

def test_many_trees_multichar_mutations(self):
Expand Down Expand Up @@ -2358,14 +2373,14 @@ def get_nonbinary_example(self, mutation_rate):

def test_nonbinary_trees(self):
ts = self.get_nonbinary_example(mutation_rate=0)
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 9182)

def test_nonbinary_trees_mutations(self):
ts = self.get_nonbinary_example(mutation_rate=2)
assert ts.num_trees > 2
assert ts.num_sites > 2
self.verify_edge_sort_offset(ts)
self.verify_sort_offset(ts)
self.verify_sort(ts.tables, 44)

def test_unknown_times(self):
Expand Down
12 changes: 10 additions & 2 deletions python/tskit/tables.py
Original file line number Diff line number Diff line change
Expand Up @@ -3111,7 +3111,7 @@ def map_ancestors(self, *args, **kwargs):
# A deprecated alias for link_ancestors()
return self.link_ancestors(*args, **kwargs)

def sort(self, edge_start=0):
def sort(self, edge_start=0, *, site_start=0, mutation_start=0):
"""
Sorts the tables in place. This ensures that all tree sequence ordering
requirements listed in the
Expand All @@ -3124,6 +3124,10 @@ def sort(self, edge_start=0):
are not affected. This parameter is provided to allow for efficient sorting
when the user knows that the edges up to a given index are already sorted.

If both ``site_start`` and ``mutation_start`` are equal to the number of rows
in their retrospective tables then neither is sorted. Note that a partial
non-sorting is not possible, and both or neither must be skipped.

The node, population and provenance tables are not affected
by this method.

Expand Down Expand Up @@ -3160,8 +3164,12 @@ def sort(self, edge_start=0):

:param int edge_start: The index in the edge table where sorting starts
(default=0; must be <= len(edges)).
:param int site_start: The index in the site table where sorting starts
(default=0; must be one of [0, len(sites)]).
:param int mutation_start: The index in the mutation table where sorting starts
(default=0; must be one of [0, len(mutations)]).
"""
self._ll_tables.sort(edge_start)
self._ll_tables.sort(edge_start, site_start, mutation_start)
# TODO add provenance

def sort_individuals(self):
Expand Down