From d13c1601fb7761902044b9ccb47800f3adc5c1bf Mon Sep 17 00:00:00 2001 From: mufernando Date: Thu, 9 Jul 2020 12:57:29 -0300 Subject: [PATCH 1/4] py sort and tsutil to disorder ts --- python/tests/tsutil.py | 105 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 105 insertions(+) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index b76acdad9c..5a0e84870d 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -680,6 +680,111 @@ def compute_mutation_times(ts): return times +def disorder_ts(ts, seed): + """ + Randomizes the order of the rows in the Edge, Site and Mutation Tables. + :param TreeSequence ts: The tree sequence whose tables are to be disordered. + Need not have a valid mutation time column. + :return: Table Collection based off of `ts` in which edges, sites and + mutations were randomly shuffled. + """ + rng = random.Random(seed) + tables = ts.dump_tables() + randomised_edges = list(ts.edges()) + rng.shuffle(randomised_edges) + tables.edges.clear() + for e in randomised_edges: + tables.edges.add_row(e.left, e.right, e.parent, e.child) + tables.sites.clear() + tables.mutations.clear() + randomised_sites = list(ts.sites()) + rng.shuffle(randomised_sites) + # Maps original IDs into their indexes in the randomised table. 
+ site_id_map = {} + randomised_mutations = [] + for s in randomised_sites: + site_id_map[s.id] = tables.sites.add_row( + s.position, ancestral_state=s.ancestral_state, metadata=s.metadata + ) + randomised_mutations.extend(s.mutations) + rng.shuffle(randomised_mutations) + for m in randomised_mutations: + tables.mutations.add_row( + site=site_id_map[m.site], + node=m.node, + derived_state=m.derived_state, + parent=m.parent, + metadata=m.metadata, + # time=m.time, + ) + return tables + + +def orig_edge_keys(i, tables): + edge = tables.edges[i] + parent_time = tables.nodes.time[edge.parent] + return parent_time, edge.parent, edge.child, edge.left + + +def orig_site_keys(i, tables): + return tables.sites.position[i] + + +def orig_mut_keys(i, tables, sorted_sites): + orig_site = tables.mutations.site[i] + return sorted_sites.index(orig_site) + + +def new_mut_keys(i, tables, sorted_sites): + orig_site = tables.mutations.site[i] + return ( + sorted_sites.index(orig_site), + tables.mutations.parent[i], + tables.mutations.node[i], + ) + + +def py_sort( + tables, edge_keys=orig_edge_keys, site_keys=orig_site_keys, mut_keys=orig_mut_keys +): + copy = tables.copy() + tables.edges.clear() + tables.sites.clear() + tables.mutations.clear() + sorted_edges = sorted( + range(copy.edges.num_rows), key=lambda x: edge_keys(x, tables=copy) + ) + sorted_sites = sorted( + range(copy.sites.num_rows), key=lambda x: site_keys(x, tables=copy) + ) + sorted_muts = sorted( + range(copy.mutations.num_rows), + key=lambda x: mut_keys(x, tables=copy, sorted_sites=sorted_sites), + ) + for edge_id in sorted_edges: + tables.edges.add_row( + copy.edges[edge_id].left, + copy.edges[edge_id].right, + copy.edges[edge_id].parent, + copy.edges[edge_id].child, + ) + for site_id in sorted_sites: + tables.sites.add_row( + copy.sites[site_id].position, + copy.sites[site_id].ancestral_state, + copy.sites[site_id].metadata, + ) + for mut_id in sorted_muts: + tables.mutations.add_row( + 
sorted_sites.index(copy.mutations[mut_id].site), + copy.mutations[mut_id].node, + copy.mutations[mut_id].derived_state, + copy.mutations[mut_id].parent, + copy.mutations[mut_id].metadata, + # copy.mutations[mut_id].time, + ) + + def algorithm_T(ts): """ Simple implementation of algorithm T from the PLOS paper, taking into From 3e8bd6ee13328ccc0d8a18f279560abb93851fa8 Mon Sep 17 00:00:00 2001 From: mufernando Date: Thu, 9 Jul 2020 17:41:07 -0300 Subject: [PATCH 2/4] mut time --- python/tests/tsutil.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 5a0e84870d..9f9525d143 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -715,7 +715,7 @@ def disorder_ts(ts, seed): derived_state=m.derived_state, parent=m.parent, metadata=m.metadata, - # time=m.time, + time=m.time, ) return tables @@ -735,10 +735,12 @@ def orig_mut_keys(i, tables, sorted_sites): return sorted_sites.index(orig_site) -def new_mut_keys(i, tables, sorted_sites): +def new_mut_keys(i, tables, sorted_sites, time_in_key): orig_site = tables.mutations.site[i] + time = tables.mutations.time[i] if time_in_key else 0 return ( sorted_sites.index(orig_site), + time, tables.mutations.parent[i], tables.mutations.node[i], ) @@ -751,6 +753,7 @@ def py_sort( tables.edges.clear() tables.sites.clear() tables.mutations.clear() + time_in_key = np.all(~np.isnan(copy.mutations.time)) sorted_edges = sorted( range(copy.edges.num_rows), key=lambda x: edge_keys(x, tables=copy) ) @@ -759,7 +762,9 @@ def py_sort( ) sorted_muts = sorted( range(copy.mutations.num_rows), - key=lambda x: mut_keys(x, tables=copy, sorted_sites=sorted_sites), + key=lambda x: mut_keys( + x, tables=copy, sorted_sites=sorted_sites, time_in_key=time_in_key + ), ) for edge_id in sorted_edges: tables.edges.add_row( @@ -781,7 +786,7 @@ def py_sort( copy.mutations[mut_id].derived_state, copy.mutations[mut_id].parent, copy.mutations[mut_id].metadata, - # 
copy.mutations[mut_id].time, + copy.mutations[mut_id].time, ) From f367929b3a7ca15730b7c6283c7b8f9b49a8c43b Mon Sep 17 00:00:00 2001 From: mufernando Date: Fri, 10 Jul 2020 12:27:35 -0300 Subject: [PATCH 3/4] allow kwargs in orig_mut_keys --- python/tests/tsutil.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 9f9525d143..dc52da62ca 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -730,7 +730,7 @@ def orig_site_keys(i, tables): return tables.sites.position[i] -def orig_mut_keys(i, tables, sorted_sites): +def orig_mut_keys(i, tables, sorted_sites, **kwargs): orig_site = tables.mutations.site[i] return sorted_sites.index(orig_site) From c543d3e381750fb2dc98a0c5fc310b1e66c3de9b Mon Sep 17 00:00:00 2001 From: mufernando Date: Fri, 10 Jul 2020 12:37:17 -0300 Subject: [PATCH 4/4] verifying C sort against Py sort --- python/tests/test_tables.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 1469505701..e6b0541094 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1233,6 +1233,13 @@ class TestSortTables(unittest.TestCase): random_seed = 12345 + def verify_sort_equality(self, ts, seed): + tables = tsutil.disorder_ts(ts, seed) + copy = tables.copy() + tables.sort() + tsutil.py_sort(copy) + self.assertEqual(tables, copy) + def verify_randomise_tables(self, ts): tables = ts.dump_tables() @@ -1318,40 +1325,47 @@ def test_single_tree_no_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 432) def test_single_tree_no_mutations_metadata(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 12) def test_many_trees_no_mutations(self): 
ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) self.assertGreater(ts.num_trees, 2) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 31) def test_single_tree_mutations(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed) self.assertGreater(ts.num_sites, 2) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 83) def test_single_tree_mutations_metadata(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed) self.assertGreater(ts.num_sites, 2) ts = tsutil.add_random_metadata(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 923) def test_single_tree_multichar_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.insert_multichar_mutations(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 35) def test_single_tree_multichar_mutations_metadata(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.insert_multichar_mutations(ts, self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 2175) def test_many_trees_mutations(self): ts = msprime.simulate( @@ -1361,12 +1375,14 @@ def test_many_trees_mutations(self): self.assertGreater(ts.num_sites, 2) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 173) def test_many_trees_multichar_mutations(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) self.assertGreater(ts.num_trees, 2) ts = tsutil.insert_multichar_mutations(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 16) def test_many_trees_multichar_mutations_metadata(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) @@ -1374,6 +1390,7 @@ def 
test_many_trees_multichar_mutations_metadata(self): ts = tsutil.insert_multichar_mutations(ts, self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) self.verify_randomise_tables(ts) + self.verify_sort_equality(ts, 91) def get_nonbinary_example(self, mutation_rate): ts = msprime.simulate( @@ -1399,6 +1416,7 @@ def test_nonbinary_trees(self): self.assertGreater(ts.num_trees, 2) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 9182) def test_nonbinary_trees_mutations(self): ts = self.get_nonbinary_example(mutation_rate=2) @@ -1406,6 +1424,7 @@ def test_nonbinary_trees_mutations(self): self.assertGreater(ts.num_sites, 2) self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort_equality(ts, 44) def test_incompatible_edges(self): ts1 = msprime.simulate(10, random_seed=self.random_seed)