From ce64c23618636b2a34463793bbf87f08de2e6651 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Wed, 20 Oct 2021 14:06:27 +0100 Subject: [PATCH] Allow skipping of sites and mutations in sort --- c/tests/test_tables.c | 10 +++++----- c/tskit/tables.c | 8 -------- python/CHANGELOG.rst | 5 ++++- python/_tskitmodule.c | 11 ++++++++--- python/tests/test_tables.py | 33 ++++++++++++++++++++++++--------- python/tskit/tables.py | 12 ++++++++++-- 6 files changed, 51 insertions(+), 28 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 67b1032ca5..15c8b8e1c6 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -6297,10 +6297,11 @@ test_sort_tables_offsets(void) CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); + /* Individual bookmark ignored */ tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.individuals = tables.individuals.num_rows - 1; ret = tsk_table_collection_sort(&tables, &bookmark, 0); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_SORT_OFFSET_NOT_SUPPORTED); + CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_table_collection_free(&tables); tsk_table_collection_free(©); @@ -6409,8 +6410,8 @@ test_sort_tables_errors(void) tsk_table_collection_t tables; tsk_bookmark_t pos; - tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, NULL, - NULL, NULL, NULL, 0); + tsk_treeseq_from_text(&ts, 1, single_tree_ex_nodes, single_tree_ex_edges, NULL, + single_tree_ex_sites, single_tree_ex_mutations, NULL, NULL, 0); ret = tsk_treeseq_copy_tables(&ts, &tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -6457,8 +6458,7 @@ test_sort_tables_errors(void) ret = tsk_table_collection_sort(&tables, &pos, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - /* Setting sites or mutations gives a BAD_PARAM. See - * github.com/tskit-dev/tskit/issues/101 */ + /* Specifying only one of sites or mutations is an error */ tsk_memset(&pos, 0, sizeof(pos)); pos.sites = 1; ret = tsk_table_collection_sort(&tables, &pos, 0); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 4e2c7333ad..e7215325fe 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6348,14 +6348,6 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; goto out; } - - /* Individuals also must be all sorted or skipped entirely */ - if (start->individuals == self->tables->individuals.num_rows) { - skip_individuals = true; - } else if (start->individuals != 0) { - ret = TSK_ERR_SORT_OFFSET_NOT_SUPPORTED; - goto out; - } } /* The indexes will be invalidated, so drop them */ ret = tsk_table_collection_drop_index(self->tables, 0); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 2534287e8c..19da8cbec5 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -39,8 +39,11 @@ **Features** +- Allow skipping of site and mutation tables in ``TableCollection.sort`` + (:user:`benjeffery`, :issue:`1475`, :pr:`1826`). + - Add ``TableCollection.sort_individuals`` to sort the individuals as this is no longer done by the - default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) + default sort (:user:`benjeffery`, :issue:`1774`, :pr:`1789`). - Add ``__setitem__`` to all tables allowing single rows to be updated. For example ``tables.nodes[0] = tables.nodes[0].replace(flags=tskit.NODE_IS_SAMPLE)`` diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 1f9c1997d4..35100f5488 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -6611,17 +6611,22 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) int err; PyObject *ret = NULL; Py_ssize_t edge_start = 0; + Py_ssize_t site_start = 0; + Py_ssize_t mutation_start = 0; tsk_bookmark_t start; - static char *kwlist[] = { "edge_start", NULL }; + static char *kwlist[] = { "edge_start", "site_start", "mutation_start", NULL }; if (TableCollection_check_state(self) != 0) { goto out; } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "|n", kwlist, &edge_start)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "|nnn", kwlist, &edge_start, &site_start, &mutation_start)) { goto out; } memset(&start, 0, sizeof(start)); - start.edges = edge_start; + start.edges = (tsk_size_t) edge_start; + start.sites = (tsk_size_t) site_start; + start.mutations = (tsk_size_t) mutation_start; err = tsk_table_collection_sort(self->tables, &start, 0); if (err != 0) { handle_library_error(err); diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 78ca33adb4..7fe87c987a 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -2216,7 +2216,7 @@ def verify_sort(self, tables, seed): self.verify_sort_mutation_consistency(tables, seed) self.verify_randomise_tables(tables, seed) - def verify_edge_sort_offset(self, ts): + def verify_sort_offset(self, ts): """ Verifies the behaviour of the edge_start offset value. """ @@ -2225,7 +2225,6 @@ def verify_edge_sort_offset(self, ts): starts = [0] if len(edges) > 2: starts = [0, 1, len(edges) // 2, len(edges) - 2] - random.seed(self.random_seed) for start in starts: # Unsort the edges starting from index start all_edges = list(ts.edges()) @@ -2235,7 +2234,7 @@ def verify_edge_sort_offset(self, ts): tables.edges.clear() for e in all_edges: tables.edges.append(e) - # Verify that import fails for randomised edges + # Verify that import fails for reversed edges with pytest.raises(_tskit.LibraryError): tables.tree_sequence() # If we sort after the start value we should still fail. @@ -2250,6 +2249,22 @@ def verify_edge_sort_offset(self, ts): # Verify the new and old edges are equal. assert edges == tables.edges + tables.tree_sequence() + if len(tables.mutations) > 2: + mutations = tables.mutations.copy() + tables.mutations.clear() + for m in mutations[::-1]: + tables.mutations.append(m) + with pytest.raises(_tskit.LibraryError): + tables.tree_sequence() + tables.sort( + 0, site_start=len(tables.sites), mutation_start=len(tables.mutations) + ) + with pytest.raises(_tskit.LibraryError): + tables.tree_sequence() + tables.sort(0) + tables.tree_sequence() + def get_wf_example(self, seed): tables = wf.wf_sim( 6, @@ -2277,7 +2292,7 @@ def test_wf_example(self): def test_single_tree_no_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 432) def test_single_tree_no_mutations_metadata(self): @@ -2288,13 +2303,13 @@ def test_single_tree_no_mutations_metadata(self): def test_many_trees_no_mutations(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) assert ts.num_trees > 2 - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 31) def test_single_tree_mutations(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed) assert ts.num_sites > 2 - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 83) def test_single_tree_mutations_metadata(self): @@ -2320,7 +2335,7 @@ def test_many_trees_mutations(self): ) assert ts.num_trees > 2 assert ts.num_sites > 2 - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 173) def test_many_trees_multichar_mutations(self): @@ -2358,14 +2373,14 @@ def get_nonbinary_example(self, mutation_rate): def test_nonbinary_trees(self): ts = self.get_nonbinary_example(mutation_rate=0) - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 9182) def test_nonbinary_trees_mutations(self): ts = self.get_nonbinary_example(mutation_rate=2) assert ts.num_trees > 2 assert ts.num_sites > 2 - self.verify_edge_sort_offset(ts) + self.verify_sort_offset(ts) self.verify_sort(ts.tables, 44) def test_unknown_times(self): diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 738246e304..d973ebc1ee 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3111,7 +3111,7 @@ def map_ancestors(self, *args, **kwargs): # A deprecated alias for link_ancestors() return self.link_ancestors(*args, **kwargs) - def sort(self, edge_start=0): + def sort(self, edge_start=0, *, site_start=0, mutation_start=0): """ Sorts the tables in place. This ensures that all tree sequence ordering requirements listed in the @@ -3124,6 +3124,10 @@ def sort(self, edge_start=0): are not affected. This parameter is provided to allow for efficient sorting when the user knows that the edges up to a given index are already sorted. + If both ``site_start`` and ``mutation_start`` are equal to the number of rows + in their retrospective tables then neither is sorted. Note that a partial + non-sorting is not possible, and both or neither must be skipped. + The node, population and provenance tables are not affected by this method. @@ -3160,8 +3164,12 @@ def sort(self, edge_start=0): :param int edge_start: The index in the edge table where sorting starts (default=0; must be <= len(edges)). + :param int site_start: The index in the site table where sorting starts + (default=0; must be one of [0, len(sites)]). + :param int mutation_start: The index in the mutation table where sorting starts + (default=0; must be one of [0, len(mutations)]). """ - self._ll_tables.sort(edge_start) + self._ll_tables.sort(edge_start, site_start, mutation_start) # TODO add provenance def sort_individuals(self):