From 5bfe9cd86eb6bbcee72d7a9b7b8ffda9bd19528a Mon Sep 17 00:00:00 2001 From: peter Date: Thu, 24 Dec 2020 09:20:55 -0800 Subject: [PATCH 1/6] removed restriction on union assert equal method (closes #1076) add keep_unused arguments to subset py sort and tsutil to disorder ts verifying C sort against Py sort shuffle tables sphinx fixup for subset options extend subset to work on unsorted tables canonicalalise (closes #705) test unrestricted union --- c/tests/test_tables.c | 419 ++++++++++++++++++++++++++++++++-- c/tskit/core.c | 4 - c/tskit/core.h | 1 - c/tskit/tables.c | 318 ++++++++++++++++++++++---- c/tskit/tables.h | 56 ++++- python/CHANGELOG.rst | 7 + python/_tskitmodule.c | 52 ++++- python/tests/test_tables.py | 444 ++++++++++++++++++++++++++++-------- python/tests/tsutil.py | 350 +++++++++++++++++++++++++++- python/tskit/tables.py | 57 ++++- python/tskit/trees.py | 67 ++++-- 11 files changed, 1571 insertions(+), 204 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 6a75a92818..1ed130c946 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -4050,6 +4050,106 @@ test_sort_tables_mutation_times(void) tsk_treeseq_free(&ts); } +static void +test_sort_tables_canonical(void) +{ + int ret; + tsk_table_collection_t t1, t2; + // this is single_tree_ex with individuals and populations + const char *nodes = "1 0 -1 1\n" + "1 0 2 3\n" + "1 0 0 -1\n" + "1 0 -1 3\n" + "0 1 2 -1\n" + "0 2 -1 2\n" + "0 3 -1 -1\n"; + const char *individuals = "0 0.0\n" + "0 1.0\n" + "0 2.0\n" + "0 3.0\n"; + const char *sites = "0 0\n" + "0.2 0\n" + "0.1 0\n"; + const char *mutations = "0 0 2 3 0.5\n" + "2 1 1 -1 0.5\n" + "1 4 3 -1 3\n" + "0 4 1 -1 2.5\n" + "2 2 1 -1 2\n" + "1 1 5 7 0.5\n" + "1 2 1 -1 2\n" + "1 1 4 2 0.5\n"; + const char *nodes_sorted = "1 0 -1 0\n" + "1 0 0 1\n" + "1 0 1 -1\n" + "1 0 -1 1\n" + "0 1 0 -1\n" + "0 2 -1 2\n" + "0 3 -1 -1\n"; + const char *individuals_sorted = "0 1.0\n" + "0 3.0\n" + "0 2.0\n" + "0 0.0\n"; + const char *sites_sorted = "0 0\n" + "0.1 0\n" + "0.2 0\n"; + const char *mutations_sorted = "0 4 1 -1 2.5\n" + "0 0 2 0 0.5\n" + "1 2 1 -1 2\n" + "1 1 1 -1 0.5\n" + "2 4 3 -1 3\n" + "2 2 1 -1 2\n" + "2 1 4 4 0.5\n" + "2 1 5 6 0.5\n"; + + ret = tsk_table_collection_init(&t1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + t1.sequence_length = 1.0; + ret = tsk_table_collection_init(&t2, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + t2.sequence_length = 1.0; + + ret = tsk_table_collection_clear(&t1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_clear(&t2, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + parse_nodes(nodes, &t1.nodes); + CU_ASSERT_EQUAL_FATAL(t1.nodes.num_rows, 7); + parse_individuals(individuals, &t1.individuals); + CU_ASSERT_EQUAL_FATAL(t1.individuals.num_rows, 4); + tsk_population_table_add_row(&t1.populations, "A", 1); + tsk_population_table_add_row(&t1.populations, "B", 1); + tsk_population_table_add_row(&t1.populations, "C", 1); + parse_edges(single_tree_ex_edges, &t1.edges); + CU_ASSERT_EQUAL_FATAL(t1.edges.num_rows, 6); + parse_sites(sites, &t1.sites); + CU_ASSERT_EQUAL_FATAL(t1.sites.num_rows, 3); + parse_mutations(mutations, &t1.mutations); + CU_ASSERT_EQUAL_FATAL(t1.mutations.num_rows, 8); + + ret = tsk_table_collection_canonicalise(&t1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + parse_nodes(nodes_sorted, &t2.nodes); + tsk_population_table_add_row(&t2.populations, "C", 1); + tsk_population_table_add_row(&t2.populations, "A", 1); + tsk_population_table_add_row(&t2.populations, "B", 1); + CU_ASSERT_EQUAL_FATAL(t2.nodes.num_rows, 7); + parse_individuals(individuals_sorted, &t2.individuals); + CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 4); + parse_edges(single_tree_ex_edges, &t2.edges); + CU_ASSERT_EQUAL_FATAL(t2.edges.num_rows, 6); + parse_sites(sites_sorted, &t2.sites); + parse_mutations(mutations_sorted, &t2.mutations); + CU_ASSERT_EQUAL_FATAL(t2.sites.num_rows, 3); + CU_ASSERT_EQUAL_FATAL(t2.mutations.num_rows, 8); + + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); + + tsk_table_collection_free(&t2); + tsk_table_collection_free(&t1); +} + static void test_sorter_interface(void) { @@ -4997,7 +5097,7 @@ test_table_collection_subset_with_options(tsk_flags_t options) CU_ASSERT_EQUAL_FATAL(ret, 0); // does not error on empty tables - ret = tsk_table_collection_subset(&tables, NULL, 0); + ret = tsk_table_collection_subset(&tables, NULL, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); // four nodes from two diploids; the first is from pop 0 @@ -5017,6 +5117,12 @@ test_table_collection_subset_with_options(tsk_flags_t options) ret = tsk_individual_table_add_row( &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); + // unused individual + ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_population_table_add_row(&tables.populations, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + // unused population ret = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); @@ -5027,6 +5133,9 @@ test_table_collection_subset_with_options(tsk_flags_t options) CU_ASSERT_FATAL(ret >= 0); ret = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); CU_ASSERT_FATAL(ret >= 0); + // unused site + ret = tsk_site_table_add_row(&tables.sites, 0.5, "C", 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); ret = tsk_mutation_table_add_row( &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); @@ -5036,37 +5145,88 @@ test_table_collection_subset_with_options(tsk_flags_t options) ret = tsk_mutation_table_add_row( &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_table_collection_build_index(&tables, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); // empty nodes should get empty tables ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, NULL, 0); + ret = tsk_table_collection_subset(&tables_copy, NULL, 0, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); - // the identity transformation + // unless FILTER_POPULATIONS is not provided + ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_subset( + &tables_copy, NULL, 0, TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); + CU_ASSERT_FATAL( + tsk_population_table_equals(&tables.populations, &tables_copy.populations, 0)); + + // or FILTER_INDIVIDUALS + ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_subset( + &tables_copy, NULL, 0, TSK_FILTER_POPULATIONS | TSK_FILTER_SITES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 3); + CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); + + // or KEEP_UNUSED_SITES + ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_subset( + &tables_copy, NULL, 0, TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); + CU_ASSERT_FATAL(tsk_site_table_equals(&tables.sites, &tables_copy.sites, 0)); + + // the identity transformation, since unused inds/pops are at the end for (k = 0; k < 4; k++) { nodes[k] = k; } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); - // reverse twice should get back to the start + // or, remove unused things: + ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_node_table_equals(&tables.nodes, &tables_copy.nodes, 0)); + CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 2); + CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 1); + CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 2); + CU_ASSERT_FATAL( + tsk_mutation_table_equals(&tables.mutations, &tables_copy.mutations, 0)); + + // reverse twice should get back to the start, since unused inds/pops are at the end for (k = 0; k < 4; k++) { nodes[k] = 3 - k; } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); @@ -5081,6 +5241,64 @@ test_table_collection_subset(void) test_table_collection_subset_with_options(TSK_NO_EDGE_METADATA); } +static void +test_table_collection_subset_unsorted(void) +{ + int ret; + tsk_table_collection_t tables; + tsk_table_collection_t tables_copy; + int k; + tsk_id_t nodes[3]; + + ret = tsk_table_collection_init(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tables.sequence_length = 1; + ret = tsk_table_collection_init(&tables_copy, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + // these tables are a big mess + ret = tsk_node_table_add_row( + &tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, TSK_NULL, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_node_table_add_row( + &tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, TSK_NULL, TSK_NULL, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_node_table_add_row(&tables.nodes, 0, 1.0, TSK_NULL, TSK_NULL, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_edge_table_add_row(&tables.edges, 0.0, 0.5, 2, 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_edge_table_add_row(&tables.edges, 0.5, 1.0, 2, 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, 2, TSK_UNKNOWN_TIME, "B", 1, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + ret = tsk_mutation_table_add_row( + &tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT_FATAL(ret >= 0); + + // but still, this should leave them unchanged + for (k = 0; k < 3; k++) { + nodes[k] = k; + } + ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_subset(&tables_copy, nodes, 3, TSK_CANONICALISE); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); + + tsk_table_collection_free(&tables_copy); + tsk_table_collection_free(&tables); +} + static void test_table_collection_subset_errors(void) { @@ -5124,19 +5342,19 @@ test_table_collection_subset_errors(void) CU_ASSERT_EQUAL_FATAL(ret, 0); tsk_migration_table_add_row(&tables_copy.migrations, 0, 1, 0, 0, 0, 0, NULL, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.migrations.num_rows, 1); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); // test out of bounds nodes ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); nodes[0] = -1; - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); nodes[0] = 6; ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS); // check integrity @@ -5149,7 +5367,7 @@ test_table_collection_subset_errors(void) ret = tsk_node_table_add_row( &tables_copy.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS); tsk_table_collection_free(&tables); @@ -5251,7 +5469,8 @@ test_table_collection_union(void) node_mapping[0] = 0; node_mapping[1] = 1; node_mapping[2] = 2; - ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3); + ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); @@ -5294,6 +5513,164 @@ test_table_collection_union(void) tsk_table_collection_free(&tables); } +static void +test_table_collection_union_middle_merge(void) +{ + /* Test ability to have non-shared history both above and below the + * shared bits. The full genealogy, in `tu`, is: + * 3 4 + * \ / + * 2 + * / \ + * 0 1 + * and the left lineage is in `ta` and right in `tb` */ + int ret; + tsk_id_t node_mapping[] = { TSK_NULL, 1, TSK_NULL }; + tsk_id_t node_order[] = { 0, 3, 1, 2, 4 }; + tsk_table_collection_t ta, tb, tu; + ret = tsk_table_collection_init(&ta, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ta.sequence_length = 1; + ret = tsk_table_collection_init(&tb, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tb.sequence_length = 1; + ret = tsk_table_collection_init(&tu, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tu.sequence_length = 1; + + ret = tsk_node_table_add_row( + &tu.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node u0 + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &ta.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node a0 = u0 + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tu.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node u1 + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tb.nodes, TSK_NODE_IS_SAMPLE, 0, TSK_NULL, TSK_NULL, NULL, 0); // node b0 = u1 + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tu.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node u2 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tu.edges, 0, 1, 2, 0, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tu.edges, 0, 1, 2, 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &ta.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node a1 = u2 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&ta.edges, 0, 1, 1, 0, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tb.nodes, 0, 1, TSK_NULL, TSK_NULL, NULL, 0); // node b1 = u2 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tb.edges, 0, 1, 1, 0, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tu.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node u3 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tu.edges, 0, 0.5, 3, 2, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &ta.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node a2 = u3 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&ta.edges, 0, 0.5, 2, 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tu.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node u4 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tu.edges, 0.5, 1, 4, 2, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_node_table_add_row( + &tb.nodes, 0, 2, TSK_NULL, TSK_NULL, NULL, 0); // node b2 = u4 + CU_ASSERT(ret >= 0); + ret = tsk_edge_table_add_row(&tb.edges, 0.5, 1, 2, 1, NULL, 0); + CU_ASSERT(ret >= 0); + + ret = tsk_site_table_add_row(&ta.sites, 0.25, "A", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_site_table_add_row(&ta.sites, 0.75, "X", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_site_table_add_row(&tb.sites, 0.25, "A", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_site_table_add_row(&tb.sites, 0.75, "X", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_site_table_add_row(&tu.sites, 0.25, "A", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_site_table_add_row(&tu.sites, 0.75, "X", 1, NULL, 0); + CU_ASSERT(ret >= 0); + + ret = tsk_mutation_table_add_row( + &tu.mutations, 0, 3, TSK_NULL, 3.5, "B", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &ta.mutations, 0, 2, TSK_NULL, 3.5, "B", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tu.mutations, 0, 2, TSK_NULL, 1.5, "D", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &ta.mutations, 0, 1, TSK_NULL, 1.5, "D", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tb.mutations, 0, 1, TSK_NULL, 1.5, "D", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tu.mutations, 0, 2, TSK_NULL, 1.2, "E", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &ta.mutations, 0, 1, TSK_NULL, 1.2, "E", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tb.mutations, 0, 1, TSK_NULL, 1.2, "E", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tu.mutations, 0, 0, TSK_NULL, 0.5, "C", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &ta.mutations, 0, 0, TSK_NULL, 0.5, "C", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tu.mutations, 1, 4, TSK_NULL, 2.4, "Y", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tb.mutations, 1, 2, TSK_NULL, 2.4, "Y", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tu.mutations, 1, 1, TSK_NULL, 0.4, "Z", 1, NULL, 0); + CU_ASSERT(ret >= 0); + ret = tsk_mutation_table_add_row( + &tb.mutations, 1, 0, TSK_NULL, 0.4, "Z", 1, NULL, 0); + CU_ASSERT(ret >= 0); + + ret = tsk_table_collection_build_index(&ta, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_compute_mutation_parents(&ta, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_build_index(&tb, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_compute_mutation_parents(&tb, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_build_index(&tu, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_compute_mutation_parents(&tu, 0); + CU_ASSERT_EQUAL(ret, 0); + + ret = tsk_table_collection_union(&ta, &tb, node_mapping, 0); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_subset(&ta, node_order, 5, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_provenance_table_clear(&ta.provenances); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tu, &ta, 0)); + + tsk_table_collection_free(&ta); + tsk_table_collection_free(&tb); + tsk_table_collection_free(&tu); +} + static void test_table_collection_union_errors(void) { @@ -5349,15 +5726,6 @@ test_table_collection_union_errors(void) &tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED); - // unsuported union - child shared parent not shared - ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); - CU_ASSERT_EQUAL_FATAL(ret, 0); - node_mapping[0] = 0; - node_mapping[1] = TSK_NULL; - ret = tsk_table_collection_union( - &tables_copy, &tables, node_mapping, TSK_UNION_NO_ADD_POP); - CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_NOT_SUPPORTED); - // test out of bounds node_mapping node_mapping[0] = -4; node_mapping[1] = 6; @@ -5552,6 +5920,7 @@ main(int argc, char **argv) { "test_edge_update_invalidates_index", test_edge_update_invalidates_index }, { "test_copy_table_collection", test_copy_table_collection }, { "test_sort_tables_errors", test_sort_tables_errors }, + { "test_sort_tables_canonical", test_sort_tables_canonical }, { "test_sorter_interface", test_sorter_interface }, { "test_dump_unindexed", test_dump_unindexed }, { "test_dump_load_empty", test_dump_load_empty }, @@ -5566,8 +5935,12 @@ main(int argc, char **argv) { "test_table_collection_check_integrity_no_populations", test_table_collection_check_integrity_no_populations }, { "test_table_collection_subset", test_table_collection_subset }, + { "test_table_collection_subset_unsorted", + test_table_collection_subset_unsorted }, { "test_table_collection_subset_errors", test_table_collection_subset_errors }, { "test_table_collection_union", test_table_collection_union }, + { "test_table_collection_union_middle_merge", + test_table_collection_union_middle_merge }, { "test_table_collection_union_errors", test_table_collection_union_errors }, { "test_table_collection_clear", test_table_collection_clear }, { NULL, NULL }, diff --git a/c/tskit/core.c b/c/tskit/core.c index 6348a21849..42ea25e37a 100644 --- a/c/tskit/core.c +++ b/c/tskit/core.c @@ -357,10 +357,6 @@ tsk_strerror_internal(int err) case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED: ret = "Only binary mutations are supported for this operation"; break; - case TSK_ERR_UNION_NOT_SUPPORTED: - ret = "Union is not supported for cases where there is non-shared" - "history older than the shared history of the two Table Collections"; - break; /* Stats errors */ case TSK_ERR_BAD_NUM_WINDOWS: diff --git a/c/tskit/core.h b/c/tskit/core.h index 93dafaddce..da11798303 100644 --- a/c/tskit/core.h +++ b/c/tskit/core.h @@ -274,7 +274,6 @@ not found in the file. #define TSK_ERR_SORT_OFFSET_NOT_SUPPORTED -803 #define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804 #define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805 -#define TSK_ERR_UNION_NOT_SUPPORTED -806 /* Stats errors */ #define TSK_ERR_BAD_NUM_WINDOWS -900 diff --git a/c/tskit/tables.c b/c/tskit/tables.c index e0953a3ed4..106baeee47 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4804,7 +4804,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) if (ret != 0) { goto out; } - ret = tsk_table_sorter_sort_mutations(self); + ret = self->sort_mutations(self); if (ret != 0) { goto out; } @@ -4838,8 +4838,9 @@ tsk_table_sorter_init( goto out; } - /* Set the sort_edges method to the default. */ + /* Set the sort_edges and sort_mutations methods to the default. */ self->sort_edges = tsk_table_sorter_sort_edges; + self->sort_mutations = tsk_table_sorter_sort_mutations; out: return ret; } @@ -4851,6 +4852,153 @@ tsk_table_sorter_free(tsk_table_sorter_t *self) return 0; } +/****************** + * Canonical sort + ******************/ + +typedef struct { + tsk_mutation_t mut; + int num_descendants; +} mutation_canonical_sort_t; + +static int +cmp_mutation_canonical(const void *a, const void *b) +{ + const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; + const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; + /* Compare mutations by site */ + int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); + if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) + && !tsk_is_unknown_time(ib->mut.time)) { + ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); + } + if (ret == 0) { + ret = (ia->num_descendants < ib->num_descendants) + - (ia->num_descendants > ib->num_descendants); + } + if (ret == 0) { + ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); + } + if (ret == 0) { + ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); + } + return ret; +} + +static int +tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t parent, mapped_parent, p; + tsk_mutation_table_t *mutations = &self->tables->mutations; + tsk_size_t num_mutations = mutations->num_rows; + tsk_mutation_table_t copy; + mutation_canonical_sort_t *sorted_mutations + = malloc(num_mutations * sizeof(*sorted_mutations)); + tsk_id_t *mutation_id_map = malloc(num_mutations * sizeof(*mutation_id_map)); + + ret = tsk_mutation_table_copy(mutations, ©, 0); + if (ret != 0) { + goto out; + } + if (mutation_id_map == NULL || sorted_mutations == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + /* compute numbers of descendants for each mutation */ + for (j = 0; j < num_mutations; j++) { + sorted_mutations[j].num_descendants = 0; + } + for (j = 0; j < num_mutations; j++) { + p = mutations->parent[j]; + while (p != TSK_NULL) { + sorted_mutations[p].num_descendants += 1; + p = mutations->parent[p]; + } + } + + for (j = 0; j < num_mutations; j++) { + tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); + sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; + } + ret = tsk_mutation_table_clear(mutations); + if (ret != 0) { + goto out; + } + + qsort(sorted_mutations, num_mutations, sizeof(*sorted_mutations), + cmp_mutation_canonical); + + /* Make a first pass through the sorted mutations to build the ID map. */ + for (j = 0; j < num_mutations; j++) { + mutation_id_map[sorted_mutations[j].mut.id] = (tsk_id_t) j; + } + + for (j = 0; j < num_mutations; j++) { + mapped_parent = TSK_NULL; + parent = sorted_mutations[j].mut.parent; + if (parent != TSK_NULL) { + mapped_parent = mutation_id_map[parent]; + } + ret = tsk_mutation_table_add_row(mutations, sorted_mutations[j].mut.site, + sorted_mutations[j].mut.node, mapped_parent, sorted_mutations[j].mut.time, + sorted_mutations[j].mut.derived_state, + sorted_mutations[j].mut.derived_state_length, + sorted_mutations[j].mut.metadata, sorted_mutations[j].mut.metadata_length); + if (ret < 0) { + goto out; + } + } + ret = 0; + +out: + tsk_safe_free(mutation_id_map); + tsk_safe_free(sorted_mutations); + tsk_mutation_table_free(©); + return ret; +} + +int TSK_WARN_UNUSED +tsk_table_collection_canonicalise( + tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) +{ + int ret = 0; + tsk_id_t k; + tsk_id_t *nodes = NULL; + tsk_table_sorter_t sorter; + + ret = tsk_table_sorter_init(&sorter, self, 0); + if (ret != 0) { + goto out; + } + + nodes = malloc(self->nodes.num_rows * sizeof(*nodes)); + if (nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { + nodes[k] = k; + } + ret = tsk_table_collection_subset( + self, nodes, self->nodes.num_rows, TSK_CANONICALISE); + if (ret != 0) { + goto out; + } + sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; + + ret = tsk_table_sorter_run(&sorter, NULL); + if (ret != 0) { + goto out; + } +out: + tsk_safe_free(nodes); + tsk_table_sorter_free(&sorter); + return ret; +} + /************************* * segment overlapper *************************/ @@ -8781,6 +8929,7 @@ tsk_table_collection_sort( if (ret != 0) { goto out; } + ret = tsk_table_sorter_run(&sorter, start); if (ret != 0) { goto out; @@ -9291,20 +9440,26 @@ tsk_table_collection_add_and_remap_node(tsk_table_collection_t *self, } int TSK_WARN_UNUSED -tsk_table_collection_subset( - tsk_table_collection_t *self, const tsk_id_t *nodes, tsk_size_t num_nodes) +tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, + tsk_size_t num_nodes, tsk_flags_t options) { int ret = 0; - tsk_id_t k, i, new_parent, new_child, new_node; + tsk_id_t j, k, new_parent, new_child, new_node, site_id; tsk_edge_t edge; - tsk_mutation_t mut; - tsk_site_t site; tsk_id_t *node_map = NULL; tsk_id_t *individual_map = NULL; tsk_id_t *population_map = NULL; tsk_id_t *site_map = NULL; tsk_id_t *mutation_map = NULL; tsk_table_collection_t tables; + tsk_individual_t ind; + tsk_population_t pop; + tsk_site_t site; + tsk_mutation_t mut; + bool canonicalise = !!(options & TSK_CANONICALISE); + bool no_filter_populations = !(options & TSK_FILTER_POPULATIONS); + bool no_filter_individuals = !(options & TSK_FILTER_INDIVIDUALS); + bool no_filter_sites = !(options & TSK_FILTER_SITES); ret = tsk_table_collection_copy(self, &tables, 0); if (ret != 0) { @@ -9335,6 +9490,27 @@ tsk_table_collection_subset( memset(site_map, 0xff, tables.sites.num_rows * sizeof(*site_map)); memset(mutation_map, 0xff, tables.mutations.num_rows * sizeof(*mutation_map)); + if (no_filter_individuals && !canonicalise) { + ret = tsk_individual_table_copy( + &tables.individuals, &self->individuals, TSK_NO_INIT); + if (ret < 0) { + goto out; + } + for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { + individual_map[k] = k; + } + } + if (no_filter_populations && !canonicalise) { + ret = tsk_population_table_copy( + &tables.populations, &self->populations, TSK_NO_INIT); + if (ret < 0) { + goto out; + } + for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { + population_map[k] = k; + } + } + // nodes, individuals, populations for (k = 0; k < (tsk_id_t) num_nodes; k++) { ret = tsk_table_collection_add_and_remap_node( @@ -9343,6 +9519,30 @@ tsk_table_collection_subset( goto out; } } + if (canonicalise) { + // keep unused individuals and populations + for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { + if (individual_map[k] == TSK_NULL) { + tsk_individual_table_get_row_unsafe(&tables.individuals, k, &ind); + ret = tsk_individual_table_add_row(&self->individuals, ind.flags, + ind.location, ind.location_length, ind.metadata, + ind.metadata_length); + if (ret < 0) { + goto out; + } + } + } + for (k = 0; k < (tsk_id_t) tables.populations.num_rows; k++) { + if (population_map[k] == TSK_NULL) { + tsk_population_table_get_row_unsafe(&tables.populations, k, &pop); + ret = tsk_population_table_add_row( + &self->populations, pop.metadata, pop.metadata_length); + if (ret < 0) { + goto out; + } + } + } + } // edges for (k = 0; k < (tsk_id_t) tables.edges.num_rows; k++) { @@ -9359,36 +9559,53 @@ tsk_table_collection_subset( } // mutations and sites - i = 0; + // make a first pass through to build the mutation_map so that + // mutation parent can be remapped even if the table is not in order + j = 0; + for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { + if (node_map[tables.mutations.node[k]] != TSK_NULL) { + mutation_map[k] = j; + j++; + site_id = tables.mutations.site[k]; + if (site_map[site_id] == TSK_NULL) { + // insert a sentinel non-NULL value: + site_map[site_id] = 1; + } + } + } + // keep retained sites in their original order + j = 0; for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) { - tsk_site_table_get_row_unsafe(&tables.sites, k, &site); - while ((i < (tsk_id_t) tables.mutations.num_rows) - && (tables.mutations.site[i] == site.id)) { - tsk_mutation_table_get_row_unsafe(&tables.mutations, i, &mut); - new_node = node_map[mut.node]; - if (new_node != TSK_NULL) { - if (site_map[site.id] == TSK_NULL) { - ret = tsk_site_table_add_row(&self->sites, site.position, - site.ancestral_state, site.ancestral_state_length, site.metadata, - site.metadata_length); - if (ret < 0) { - goto out; - } - site_map[site.id] = ret; - } - new_parent = TSK_NULL; - if (mut.parent != TSK_NULL) { - new_parent = mutation_map[mut.parent]; - } - ret = tsk_mutation_table_add_row(&self->mutations, site_map[site.id], - new_node, new_parent, mut.time, mut.derived_state, - mut.derived_state_length, mut.metadata, mut.metadata_length); - if (ret < 0) { - goto out; - } - mutation_map[mut.id] = ret; + if (canonicalise || no_filter_sites || site_map[k] != TSK_NULL) { + tsk_site_table_get_row_unsafe(&tables.sites, k, &site); + ret = tsk_site_table_add_row(&self->sites, site.position, + site.ancestral_state, site.ancestral_state_length, site.metadata, + site.metadata_length); + if (ret < 0) { + goto out; } - i++; + site_map[k] = j; + j++; + } + } + for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { + tsk_mutation_table_get_row_unsafe(&tables.mutations, k, &mut); + new_node = node_map[mut.node]; + if (new_node != TSK_NULL) { + new_parent = TSK_NULL; + if (mut.parent != TSK_NULL) { + new_parent = mutation_map[mut.parent]; + } + ret = tsk_mutation_table_add_row(&self->mutations, site_map[mut.site], + new_node, new_parent, mut.time, mut.derived_state, + mut.derived_state_length, mut.metadata, mut.metadata_length); + if (ret < 0) { + goto out; + } + tsk_bug_assert(mutation_map[mut.id] == ret); + } + if (ret < 0) { + goto out; } } @@ -9441,7 +9658,6 @@ tsk_check_subset_equality(tsk_table_collection_t *self, } } - // TODO: strict sort before checking equality ret = tsk_table_collection_copy(self, &self_copy, 0); if (ret != 0) { goto out; @@ -9450,11 +9666,21 @@ tsk_check_subset_equality(tsk_table_collection_t *self, if (ret != 0) { goto out; } - ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes); + ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes, + TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_canonicalise(&self_copy, 0); if (ret != 0) { goto out; } - ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes); + ret = tsk_table_collection_canonicalise(&other_copy, 0); if (ret != 0) { goto out; } @@ -9550,15 +9776,6 @@ tsk_table_collection_union(tsk_table_collection_t *self, tsk_edge_table_get_row_unsafe(&other->edges, k, &edge); if ((other_node_mapping[edge.parent] == TSK_NULL) || (other_node_mapping[edge.child] == TSK_NULL)) { - /* TODO: union does not support case where non-shared bits of - * other are above the shared bits of self and other. This will be - * resolved when the Mutation Table has a time attribute and - * the Mutation Table is sorted on time. */ - if (other_node_mapping[edge.parent] == TSK_NULL - && other_node_mapping[edge.child] != TSK_NULL) { - ret = TSK_ERR_UNION_NOT_SUPPORTED; - goto out; - } new_parent = node_map[edge.parent]; new_child = node_map[edge.child]; ret = tsk_edge_table_add_row(&self->edges, edge.left, edge.right, new_parent, @@ -9620,6 +9837,13 @@ tsk_table_collection_union(tsk_table_collection_t *self, goto out; } + // need to sort again since after deduplicating sites, mutations + // may not be sorted by time within sites + ret = tsk_table_collection_sort(self, 0, 0); + if (ret < 0) { + goto out; + } + ret = tsk_table_collection_build_index(self, 0); if (ret < 0) { goto out; diff --git a/c/tskit/tables.h b/c/tskit/tables.h index fc2c456724..cdb0964ede 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -635,6 +635,8 @@ typedef struct _tsk_table_sorter_t { tsk_table_collection_t *tables; /** @brief The edge sorting function. If set to NULL, edges are not sorted. */ int (*sort_edges)(struct _tsk_table_sorter_t *self, tsk_size_t start); + /** @brief The mutation sorting function. */ + int (*sort_mutations)(struct _tsk_table_sorter_t *self); /** @brief An opaque pointer for use by client code */ void *user_data; /** @brief Mapping from input site IDs to output site IDs */ @@ -698,7 +700,7 @@ typedef struct { /**@} */ -/* Flags for simplify() */ +/* Flags for simplify() and/or subset() */ #define TSK_FILTER_SITES (1 << 0) #define TSK_FILTER_POPULATIONS (1 << 1) #define TSK_FILTER_INDIVIDUALS (1 << 2) @@ -706,6 +708,7 @@ typedef struct { #define TSK_KEEP_UNARY (1 << 4) #define TSK_KEEP_INPUT_ROOTS (1 << 5) #define TSK_KEEP_UNARY_IN_INDIVIDUALS (1 << 6) +#define TSK_CANONICALISE (1 << 7) /* Flags for check_integrity */ #define TSK_CHECK_EDGE_ORDERING (1 << 0) @@ -740,7 +743,7 @@ typedef struct { #define TSK_CMP_IGNORE_METADATA (1 << 2) #define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3) -/* Flags for tables collection clear */ +/* Flags for table collection clear */ #define TSK_CLEAR_METADATA_SCHEMAS (1 << 0) #define TSK_CLEAR_TS_METADATA_AND_SCHEMA (1 << 1) #define TSK_CLEAR_PROVENANCE (1 << 2) @@ -2650,6 +2653,22 @@ TSK_NO_CHECK_INTEGRITY int tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); +/** +@brief Puts the tables into canonical order. + +@rst +This method puts tables in canonical order, which exceeds the usual +sortedness requirements in such a way that randomly reshuffled tables are +guaranteed to always be sorted to the same order (with the exception of +individuals or populations that are not referenced by any nodes). In +particular, this method puts mutation parents before their children. + +@endrst + +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options); + /** @brief Simplify the tables to remove redundant information. @@ -2732,19 +2751,33 @@ they appear in the ``nodes`` argument. Specifically, this subsets and reorders each of the tables as follows: 1. Nodes: if in the list of nodes, and in the order provided. -2. Individuals and Populations: if referred to by a retained node, - and in the order first seen when traversing the list of retained nodes. -3. Edges: if both parent and child are retained nodes. -4. Mutations: if the mutation's node is a retained node. -5. Sites: if any mutations remain at the site after removing mutations. +2. Individuals, if `TSK_FILTER_INDIVIDUALS`: if referred to by a retained node, + and in the order first seen when traversing the list of retained nodes. +3. Populations, if `TSK_FILTER_POPULATIONS`: the same as Individuals. +4. Edges: if both parent and child are retained nodes. +5. Mutations: if the mutation's node is a retained node. +6. Sites, if `TSK_FILTER_SITES`: if any mutations remain at the site after + removing mutations. Retained edges, mutations, and sites appear in the same -order as in the original tables. +order as in the original tables. If any of `TSK_FILTER_INDIVIDUALS`, +`TSK_FILTER_POPULATIONS`, or `TSK_FILTER_SITES` are *not* provided, +then the respective tables will be *unchanged*. If ``nodes`` is the entire list of nodes in the tables, then the resulting tables will be identical to the original tables, but with nodes (and individuals and populations) reordered. +An additional option, `TSK_CANONICALISE`, will reorder both individual and +population tables, as described above if `TSK_FILTER_INDIVIDUALS` and +`TSK_FILTER_POPULATIONS` were provided, but will retain any unreferenced +individuals and populations at the end of the individual and population tables. +It will also keep unreferenced sites (as if `TSK_FILTER_SITES` was not +provided). This is useful so that this can be used to reorder the tables +without losing information. + +This function does *not* require the tables to be sorted. + .. note:: Migrations are currently not supported by susbset, and an error will be raised if we attempt call subset on a table collection with greater than zero migrations. @@ -2753,10 +2786,11 @@ nodes (and individuals and populations) reordered. @param self A pointer to a tsk_table_collection_t object. @param nodes An array of num_nodes valid node IDs. @param num_nodes The number of node IDs in the input nodes array. +@param options Bitwise option flags. @return Return 0 on success or a negative value on failure. */ -int tsk_table_collection_subset( - tsk_table_collection_t *self, const tsk_id_t *nodes, tsk_size_t num_nodes); +int tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, + tsk_size_t num_nodes, tsk_flags_t options); /** @brief Forms the node-wise union of two table collections. @@ -3039,7 +3073,7 @@ Runs the sorting process: bounds of the edge table. 3. Sort the site table, building the mapping between site IDs in the current and sorted tables. -4. Sort the mutation table. +4. Sort the mutation table, using the ``sort_mutations`` pointer. If an error occurs during the execution of a user-supplied sorting function a non-zero value must be returned. This value diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 25d099e01f..273a05b820 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -21,6 +21,13 @@ - ``default`` in metadata schemas is used to fill in missing values when encoding for the struct codec. (:user:`benjeffery`, :issue:`1073`, :pr:`1116`). +- Added ``canonical`` option to table collection sorting (:user:`mufernando`, + :user:`petrelharp`, :issue:`705`) + +- Added various arguments to ``TreeSequence.subset``, to allow for stable + population indexing and lossless node reordering with subset. + (:user:`petrelharp`, :pr:`1097`) + -------------------- [0.3.4] - 2020-12-02 -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 4b7339300d..f0b244954a 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -5083,19 +5083,27 @@ TableCollection_link_ancestors(TableCollection *self, PyObject *args, PyObject * } static PyObject * -TableCollection_subset(TableCollection *self, PyObject *args) +TableCollection_subset(TableCollection *self, PyObject *args, PyObject *kwds) { int err; PyObject *ret = NULL; PyObject *nodes = NULL; PyArrayObject *nodes_array = NULL; npy_intp *shape; + tsk_flags_t options = 0; + int filter_populations = true; + int filter_individuals = true; + int filter_sites = true; + int canonicalise = false; size_t num_nodes; + static char *kwlist[] = { "nodes", "filter_populations", "filter_individuals", + "filter_sites", "canonicalise", NULL }; if (TableCollection_check_state(self) != 0) { goto out; } - if (!PyArg_ParseTuple(args, "O", &nodes)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiii", kwlist, &nodes, + &filter_populations, &filter_individuals, &filter_sites, &canonicalise)) { goto out; } nodes_array @@ -5105,9 +5113,21 @@ TableCollection_subset(TableCollection *self, PyObject *args) } shape = PyArray_DIMS(nodes_array); num_nodes = shape[0]; + if (filter_populations) { + options |= TSK_FILTER_POPULATIONS; + } + if (filter_individuals) { + options |= TSK_FILTER_INDIVIDUALS; + } + if (filter_sites) { + options |= TSK_FILTER_SITES; + } + if (canonicalise) { + options |= TSK_CANONICALISE; + } err = tsk_table_collection_subset( - self->tables, PyArray_DATA(nodes_array), num_nodes); + self->tables, PyArray_DATA(nodes_array), num_nodes, options); if (err != 0) { handle_library_error(err); goto out; @@ -5332,6 +5352,7 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) } memset(&start, 0, sizeof(start)); start.edges = edge_start; + err = tsk_table_collection_sort(self->tables, &start, 0); if (err != 0) { handle_library_error(err); @@ -5342,6 +5363,25 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) return ret; } +static PyObject * +TableCollection_canonicalise(TableCollection *self) +{ + int err; + PyObject *ret = NULL; + + if (TableCollection_check_state(self) != 0) { + goto out; + } + err = tsk_table_collection_canonicalise(self->tables, 0); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue(""); +out: + return ret; +} + static PyObject * TableCollection_compute_mutation_parents(TableCollection *self) { @@ -5732,7 +5772,7 @@ static PyMethodDef TableCollection_methods[] = { = "Returns an edge table linking samples to a set of specified ancestors." }, { .ml_name = "subset", .ml_meth = (PyCFunction) TableCollection_subset, - .ml_flags = METH_VARARGS, + .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Subsets the table collection to a set of nodes." }, { .ml_name = "union", .ml_meth = (PyCFunction) TableCollection_union, @@ -5748,6 +5788,10 @@ static PyMethodDef TableCollection_methods[] = { .ml_meth = (PyCFunction) TableCollection_sort, .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Sorts the tables to satisfy tree sequence requirements." }, + { .ml_name = "canonicalise", + .ml_meth = (PyCFunction) TableCollection_canonicalise, + .ml_flags = METH_NOARGS, + .ml_doc = "Puts the tables in canonical order." }, { .ml_name = "equals", .ml_meth = (PyCFunction) TableCollection_equals, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 46999e0154..6672fed907 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1389,63 +1389,91 @@ def test_index(self): class TestSortTables: """ - Tests for the TableCollection.sort() method. + Tests for the TableCollection.sort() and TableCollection.canonicalise() methods. """ random_seed = 12345 - def verify_randomise_tables(self, ts): - tables = ts.dump_tables() - tables.compute_mutation_times() - ts = tables.tree_sequence() - - # Randomise the tables - random.seed(self.random_seed) - randomised_edges = list(ts.edges()) - random.shuffle(randomised_edges) - tables.edges.clear() - for e in randomised_edges: - tables.edges.add_row(e.left, e.right, e.parent, e.child) - # Verify that import fails for randomised edges - with pytest.raises(_tskit.LibraryError): - tables.tree_sequence() + def verify_sort_equality(self, tables, seed): + tables1 = tables.copy() + tsutil.shuffle_tables( + tables1, + seed, + shuffle_individuals=False, + shuffle_populations=False, + ) + tables2 = tables1.copy() + tables1.sort() + tsutil.py_sort(tables2) + tsutil.assert_table_collections_equal(tables1, tables2) + + def verify_canonical_equality(self, tables, seed): + tables1 = tables.copy() + tsutil.shuffle_tables( + tables1, + seed, + ) + tables2 = tables1.copy() + tables1.canonicalise() + tsutil.py_sort(tables2, canonical=True) + tsutil.assert_table_collections_equal(tables1, tables2) + + def verify_sort_mutation_consistency(self, orig_tables, seed): + tables = orig_tables.copy() + mut_map = {s.position: [] for s in tables.sites} + for mut in tables.mutations: + mut_map[tables.sites[mut.site].position].append( + (mut.node, mut.derived_state, mut.metadata) + ) + tsutil.shuffle_tables( + tables, seed, shuffle_individuals=False, shuffle_populations=False + ) + for mut in tables.mutations: + site = tables.sites[mut.site] + assert (mut.node, mut.derived_state, mut.metadata) in mut_map[site.position] tables.sort() - assert tables == ts.dump_tables() + for mut in tables.mutations: + site = tables.sites[mut.site] + assert (mut.node, mut.derived_state, mut.metadata) in mut_map[site.position] - tables.sites.clear() - tables.mutations.clear() - randomised_sites = list(ts.sites()) - random.shuffle(randomised_sites) - # Maps original IDs into their indexes in the randomised table. - site_id_map = {} - randomised_mutations = [] - for s in randomised_sites: - site_id_map[s.id] = tables.sites.add_row( - s.position, ancestral_state=s.ancestral_state, metadata=s.metadata - ) - randomised_mutations.extend(s.mutations) - random.shuffle(randomised_mutations) - for m in randomised_mutations: - tables.mutations.add_row( - site=site_id_map[m.site], - node=m.node, - derived_state=m.derived_state, - parent=m.parent, - metadata=m.metadata, - time=m.time, - ) - if ts.num_sites > 1: - # Verify that import fails for randomised sites - with pytest.raises(_tskit.LibraryError): - tables.tree_sequence() + def verify_randomise_tables(self, orig_tables, seed): + # Check we can shuffle everything and then put it back in canonical form + tables = orig_tables.copy() tables.sort() - assert tables == ts.dump_tables() + sorted_tables = tables.copy() + + # First randomize only edges: this should work without canonical sorting. + tsutil.shuffle_tables( + tables, + seed=seed, + shuffle_edges=True, + shuffle_populations=False, + shuffle_individuals=False, + shuffle_sites=False, + shuffle_mutations=False, + ) + tables.sort() + tsutil.assert_table_collections_equal(tables, sorted_tables) + + # Now also randomize sites and mutations + tables.canonicalise() + sorted_tables = tables.copy() + tsutil.shuffle_tables( + tables, seed=1234, shuffle_populations=False, shuffle_individuals=False + ) + tables.canonicalise() + tsutil.assert_table_collections_equal(tables, sorted_tables) - ts_new = tables.tree_sequence() - assert ts_new.num_edges == ts.num_edges - assert ts_new.num_trees == ts.num_trees - assert ts_new.num_sites == ts.num_sites - assert ts_new.num_mutations == ts.num_mutations + # Finally, randomize everything else + tsutil.shuffle_tables(tables, seed=1234) + tables.canonicalise() + tsutil.assert_table_collections_equal(tables, sorted_tables) + + def verify_sort(self, tables, seed): + self.verify_sort_equality(tables, seed) + self.verify_canonical_equality(tables, seed) + self.verify_sort_mutation_consistency(tables, seed) + self.verify_randomise_tables(tables, seed) def verify_edge_sort_offset(self, ts): """ @@ -1481,44 +1509,50 @@ def verify_edge_sort_offset(self, ts): # Verify the new and old edges are equal. assert edges == tables.edges + def get_wf_example(self, seed): + tables = wf.wf_sim(6, 3, num_pops=2, seed=seed, num_loci=3) + tables.sort() + ts = tables.tree_sequence() + return tsutil.insert_individuals(ts, ploidy=2) + def test_single_tree_no_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 432) def test_single_tree_no_mutations_metadata(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 12) def test_many_trees_no_mutations(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) assert ts.num_trees > 2 - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 31) def test_single_tree_mutations(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed) assert ts.num_sites > 2 - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 83) def test_single_tree_mutations_metadata(self): ts = msprime.simulate(10, mutation_rate=2, random_seed=self.random_seed) assert ts.num_sites > 2 ts = tsutil.add_random_metadata(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 384) def test_single_tree_multichar_mutations(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.insert_multichar_mutations(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 185) def test_single_tree_multichar_mutations_metadata(self): ts = msprime.simulate(10, random_seed=self.random_seed) ts = tsutil.insert_multichar_mutations(ts, self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 2175) def test_many_trees_mutations(self): ts = msprime.simulate( @@ -1526,21 +1560,21 @@ def test_many_trees_mutations(self): ) assert ts.num_trees > 2 assert ts.num_sites > 2 - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 173) def test_many_trees_multichar_mutations(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) assert ts.num_trees > 2 ts = tsutil.insert_multichar_mutations(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 16) def test_many_trees_multichar_mutations_metadata(self): ts = msprime.simulate(10, recombination_rate=2, random_seed=self.random_seed) assert ts.num_trees > 2 ts = tsutil.insert_multichar_mutations(ts, self.random_seed) ts = tsutil.add_random_metadata(ts, self.random_seed) - self.verify_randomise_tables(ts) + self.verify_sort(ts.tables, 91) def get_nonbinary_example(self, mutation_rate): ts = msprime.simulate( @@ -1559,20 +1593,52 @@ def get_nonbinary_example(self, mutation_rate): found = True break assert found + assert ts.num_trees > 2 return ts def test_nonbinary_trees(self): ts = self.get_nonbinary_example(mutation_rate=0) - assert ts.num_trees > 2 - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 9182) def test_nonbinary_trees_mutations(self): ts = self.get_nonbinary_example(mutation_rate=2) assert ts.num_trees > 2 assert ts.num_sites > 2 - self.verify_randomise_tables(ts) self.verify_edge_sort_offset(ts) + self.verify_sort(ts.tables, 44) + + def test_unknown_times(self): + ts = self.get_wf_example(seed=486) + ts = tsutil.insert_branch_mutations(ts, mutations_per_branch=2) + ts = tsutil.remove_mutation_times(ts) + self.verify_sort(ts.tables, 9182) + + def test_no_mutation_parents(self): + # we should maintain relative order of mutations when all else fails: + # these tables are not canonicalizable (since we don't sort on derived state) + rng = random.Random(7000) + alleles = ["A", "B", "C", "D", "E", "F", "G"] + for t in [0.5, None]: + rng.shuffle(alleles) + tables = tskit.TableCollection(sequence_length=1) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.sites.add_row(position=0, ancestral_state="") + for a in alleles: + tables.mutations.add_row(site=0, node=0, derived_state=a, time=t) + tables_canonical = tables.copy() + tables_canonical.canonicalise() + tables.sort() + for t in (tables, tables_canonical): + for a, mut in zip(alleles, t.mutations): + assert a == mut.derived_state + self.verify_sort_equality(t, 985) + self.verify_sort_mutation_consistency(t, 985) + + def test_discrete_times(self): + ts = self.get_wf_example(seed=623) + ts = tsutil.insert_discrete_time_mutations(ts) + self.verify_sort(ts.tables, 9183) def test_incompatible_edges(self): ts1 = msprime.simulate(10, random_seed=self.random_seed) @@ -1583,6 +1649,8 @@ def test_incompatible_edges(self): # The edges in tables2 will refer to nodes that don't exist. with pytest.raises(_tskit.LibraryError): tables2.sort() + with pytest.raises(_tskit.LibraryError): + tables2.canonicalise() def test_incompatible_sites(self): ts1 = msprime.simulate(10, random_seed=self.random_seed) @@ -1594,6 +1662,8 @@ def test_incompatible_sites(self): tables1.mutations.set_columns(**tables2.mutations.asdict()) with pytest.raises(_tskit.LibraryError): tables1.sort() + with pytest.raises(_tskit.LibraryError): + tables1.canonicalise() def test_incompatible_mutation_nodes(self): ts1 = msprime.simulate(2, random_seed=self.random_seed) @@ -1602,11 +1672,12 @@ def test_incompatible_mutation_nodes(self): tables1 = ts1.dump_tables() tables2 = ts2.dump_tables() # The mutations in tables2 will refer to nodes that don't exist. - # print(tables2.sites.asdict()) tables1.sites.set_columns(**tables2.sites.asdict()) tables1.mutations.set_columns(**tables2.mutations.asdict()) with pytest.raises(_tskit.LibraryError): tables1.sort() + with pytest.raises(_tskit.LibraryError): + tables1.canonicalise() def test_empty_tables(self): tables = tskit.TableCollection(1) @@ -3150,7 +3221,7 @@ def get_msprime_example(self, sample_size=10, seed=1234): return ts.tables def get_wf_example(self, N=5, ngens=2, seed=1249): - tables = wf.wf_sim(N, N, seed=seed) + tables = wf.wf_sim(N, N, num_pops=2, seed=seed) tables.sort() ts = tables.tree_sequence() # adding muts @@ -3164,11 +3235,38 @@ def get_examples(self, seed): yield self.get_wf_example(seed=seed) def verify_subset_equality(self, tables, nodes): - sub1 = tables.copy() - sub2 = tables.copy() - tsutil.py_subset(sub1, nodes, record_provenance=False) - sub2.subset(nodes, record_provenance=False) - assert sub1 == sub2 + for kp, ki, ks, cn in [ + [False, False, False, False], + [True, False, False, False], + [False, True, False, False], + [False, False, True, False], + [True, True, False, False], + [True, False, True, False], + [False, True, True, False], + [True, True, True, False], + [False, False, False, True], + [True, False, False, True], + ]: + sub1 = tables.copy() + sub2 = tables.copy() + tsutil.py_subset( + sub1, + nodes, + record_provenance=False, + filter_populations=kp, + filter_individuals=ki, + filter_sites=ks, + canonicalise=cn, + ) + sub2.subset( + nodes, + record_provenance=False, + filter_populations=kp, + filter_individuals=ki, + filter_sites=ks, + canonicalise=cn, + ) + tsutil.assert_table_collections_equal(sub1, sub2) def verify_subset(self, tables, nodes): self.verify_subset_equality(tables, nodes) @@ -3254,19 +3352,48 @@ def test_ts_subset(self): assert tables == tables2 def test_subset_all(self): - # subsetting to everything shouldn't change things - # except the individual ids in the node tables if - # there are gaps + # subsetting to everything shouldn't change things except the + # individual and population ids in the node tables if there are gaps for tables in self.get_examples(123583): tables2 = tables.copy() tables2.subset(np.arange(tables.nodes.num_rows)) - tables.provenances.clear() - tables2.provenances.clear() tables.individuals.clear() tables2.individuals.clear() + assert np.all(tables.nodes.time == tables2.nodes.time) + assert np.all(tables.nodes.flags == tables2.nodes.flags) + assert np.all(tables.nodes.population == tables2.nodes.population) + assert np.all(tables.nodes.metadata == tables2.nodes.metadata) tables.nodes.clear() tables2.nodes.clear() - assert tables == tables2 + tsutil.assert_table_collections_equal( + tables, tables2, ignore_provenance=True + ) + + def test_shuffled_tables(self): + # subset should work on even unsorted tables + # (tested more thoroughly in TestSortTables) + for tables in self.get_examples(95521): + tables2 = tables.copy() + tsutil.shuffle_tables(tables2, 7000) + tables2.subset( + np.arange(tables.nodes.num_rows), + canonicalise=True, + ) + assert tables.nodes.num_rows == tables2.nodes.num_rows + assert tables.individuals.num_rows == tables2.individuals.num_rows + assert tables.populations.num_rows == tables2.populations.num_rows + assert tables.edges.num_rows == tables2.edges.num_rows + assert tables.sites.num_rows == tables2.sites.num_rows + assert tables.mutations.num_rows == tables2.mutations.num_rows + tables2 = tables.copy() + tsutil.shuffle_tables(tables2, 7001) + tables2.subset([]) + assert tables2.nodes.num_rows == 0 + assert tables2.individuals.num_rows == 0 + assert tables2.populations.num_rows == 0 + assert tables2.edges.num_rows == 0 + assert tables2.sites.num_rows == 0 + assert tables2.mutations.num_rows == 0 def test_random_subsets(self): rng = np.random.default_rng(1542) @@ -3288,13 +3415,55 @@ def test_empty_nodes(self): assert subset.mutations.num_rows == 0 assert subset.provenances == tables.provenances + def test_no_filter_sites(self): + tables = tskit.TableCollection(sequence_length=10) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=1) + tables.edges.add_row(parent=1, child=0, left=0, right=10) + for k in range(10): + tables.sites.add_row(position=k, ancestral_state=str(k)) + sub_tables = tables.copy() + sub_tables.subset([], filter_sites=False) + assert tables.sites == sub_tables.sites + ts = tables.tree_sequence() + sub_tables = ts.subset([], filter_sites=False).tables + assert tables.sites == sub_tables.sites + + def test_no_filter_populations(self): + tables = tskit.TableCollection(sequence_length=10) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=1) + tables.edges.add_row(parent=1, child=0, left=0, right=10) + for k in range(10): + tables.populations.add_row(metadata=str(k).encode()) + sub_tables = tables.copy() + sub_tables.subset([], filter_populations=False) + assert tables.populations == sub_tables.populations + ts = tables.tree_sequence() + sub_tables = ts.subset([], filter_populations=False).tables + assert tables.populations == sub_tables.populations + + def test_no_filter_individuals(self): + tables = tskit.TableCollection(sequence_length=10) + tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) + tables.nodes.add_row(time=1) + tables.edges.add_row(parent=1, child=0, left=0, right=10) + for k in range(10): + tables.individuals.add_row(metadata=str(k).encode()) + sub_tables = tables.copy() + sub_tables.subset([], filter_individuals=False) + assert tables.individuals == sub_tables.individuals + ts = tables.tree_sequence() + sub_tables = ts.subset([], filter_individuals=False).tables + assert tables.individuals == sub_tables.individuals + -class TestUnion(unittest.TestCase): +class TestUnionTables(unittest.TestCase): """ Tests for the TableCollection.union method """ - def get_msprime_example(self, sample_size=3, T=5, seed=1239): + def get_msprime_example(self, sample_size, T, seed): # we assume after the split the ts are completely independent M = [[0, 0], [0, 0]] population_configurations = [ @@ -3319,19 +3488,18 @@ def get_msprime_example(self, sample_size=3, T=5, seed=1239): ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=1) return ts - def get_wf_example(self, N=5, T=5, seed=1249): + def get_wf_example(self, N, T, seed): twopop_tables = wf.wf_sim(N, T, num_pops=2, seed=seed, deep_history=True) twopop_tables.sort() ts = twopop_tables.tree_sequence() ts = ts.simplify() - # adding muts ts = tsutil.jukes_cantor(ts, 1, 10, seed=seed) ts = tsutil.add_random_metadata(ts, seed) ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=2) return ts def split_example(self, ts, T): - # splitting two pop ts into disjoint ts + # splitting two pop ts *with no migration* into disjoint ts shared_nodes = [n.id for n in ts.nodes() if n.time >= T] pop1 = list(ts.samples(population=0)) pop2 = list(ts.samples(population=1)) @@ -3366,13 +3534,21 @@ def verify_union_equality(self, tables, other, node_mapping, add_populations=Tru record_provenance=False, add_populations=add_populations, ) - assert uni1.equals(uni2, ignore_ts_metadata=True, ignore_provenance=True) + tsutil.assert_table_collections_equal(uni1, uni2, ignore_provenance=True) # verifying that subsetting to original nodes return the same table orig_nodes = [j for i, j in enumerate(node_mapping) if j != tskit.NULL] uni1.subset(orig_nodes) # subsetting tables just to make sure order is the same tables.subset(orig_nodes) - assert uni1.equals(tables, ignore_ts_metadata=True, ignore_provenance=True) + tsutil.assert_table_collections_equal(uni1, tables, ignore_provenance=True) + + def test_union_empty(self): + ts1 = self.get_msprime_example(sample_size=3, T=2, seed=9328) + ts2 = tskit.TableCollection(sequence_length=ts1.sequence_length).tree_sequence() + uni = ts1.union(ts2, []) + tsutil.assert_table_collections_equal( + ts1.tables, uni.tables, ignore_provenance=True + ) def test_noshared_example(self): ts1 = self.get_msprime_example(sample_size=3, T=2, seed=9328) @@ -3392,16 +3568,17 @@ def test_all_shared_example(self): def test_no_add_pop(self): self.verify_union_equality( - *self.split_example(self.get_msprime_example(10, 10), 10), + *self.split_example(self.get_msprime_example(10, 10, seed=135), 10), add_populations=False, ) self.verify_union_equality( - *self.split_example(self.get_wf_example(10, 10), 10), add_populations=False + *self.split_example(self.get_wf_example(10, 10, seed=157), 10), + add_populations=False, ) def test_provenance(self): tables, other, node_mapping = self.split_example( - self.get_msprime_example(5, 2, seed=928), 2 + self.get_msprime_example(5, T=2, seed=928), 2 ) tables_copy = tables.copy() tables.union(other, node_mapping) @@ -3421,10 +3598,93 @@ def test_provenance(self): def test_examples(self): for N in [2, 4, 5]: for T in [2, 5, 20]: - with self.subTest(N=N, T=T): - self.verify_union_equality( - *self.split_example(self.get_msprime_example(N, T), T) - ) - self.verify_union_equality( - *self.split_example(self.get_wf_example(N, T), T) - ) + for mut_times in [True, False]: + with self.subTest(N=N, T=T): + ts = self.get_msprime_example(N, T=T, seed=888) + if mut_times: + tables = ts.tables + tables.compute_mutation_times() + ts = tables.tree_sequence() + self.verify_union_equality(*self.split_example(ts, T)) + ts = self.get_wf_example(N=N, T=T, seed=827) + if mut_times: + tables = ts.tables + tables.compute_mutation_times() + ts = tables.tree_sequence() + self.verify_union_equality(*self.split_example(ts, T)) + + +class TestSubsetUnion(unittest.TestCase): + # Check that we can remove a single individual from a tree sequence and + # then use union to put it back in again. This is a good test of union, + # subset, and various sort options. + + def verify_subset_union(self, ts): + tables = ts.tables + # remove unused individuals and populations since if there's more than + # one of these then it can't be canonically sorted below + tables.subset(np.arange(tables.nodes.num_rows), filter_sites=False) + has_unknown_times = np.any(tskit.is_unknown_time(tables.mutations.time)) + if has_unknown_times: + tsutil.shuffle_tables(tables, seed=2545, keep_mutation_parent_order=True) + else: + tsutil.shuffle_tables(tables, seed=2545, keep_mutation_parent_order=False) + rng = random.Random(2345) + target = np.array([rng.choice(list(range(tables.nodes.num_rows)))]) + if ts.node(target[0]).individual != tskit.NULL: + ind = ts.individual(ts.node(target[0]).individual) + target = ind.nodes + target_relatives = np.concatenate( + [ + tables.edges.parent[np.isin(tables.edges.child, target)], + tables.edges.child[np.isin(tables.edges.parent, target)], + ] + ) + not_target = [a for a in range(tables.nodes.num_rows) if a not in target] + # relationships to target + target_plus = np.concatenate([target, target_relatives]) + sub1 = tables.copy() + sub1.subset(target_plus, filter_populations=False, filter_sites=False) + # everything else + new_tables = tables.copy() + new_tables.subset(not_target, filter_populations=False, filter_sites=False) + # and union back together + mapping12 = [tskit.NULL for _ in target] + [ + not_target.index(n) for n in target_relatives + ] + # union needs tables to be sorted + new_tables.sort() + sub1.sort() + # double-check we're doing the mapping right + new_tables.union(sub1, node_mapping=mapping12, add_populations=False) + reorder = np.arange(new_tables.nodes.num_rows) + for j, t in enumerate(target): + reorder[t:] -= 1 + reorder[t] = new_tables.nodes.num_rows - len(target) + j + new_tables.subset(reorder, filter_sites=False) + new_tables.canonicalise() + tables.canonicalise() + tsutil.assert_table_collections_equal( + new_tables, tables, ignore_provenance=True + ) + + def get_wf_example(self, N, T, seed): + twopop_tables = wf.wf_sim(N, T, num_pops=2, seed=seed, deep_history=True) + twopop_tables.sort() + ts = twopop_tables.tree_sequence() + ts = ts.simplify() + ts = tsutil.jukes_cantor(ts, 1, 10, seed=seed) + ts = tsutil.add_random_metadata(ts, seed) + ts = tsutil.insert_random_ploidy_individuals(ts, max_ploidy=2) + return ts + + def test_no_mutation_times(self): + ts = self.get_wf_example(10, 4, seed=925) + self.verify_subset_union(ts) + + def test_with_mutation_times(self): + ts = self.get_wf_example(10, 4, seed=61) + tables = ts.tables + tables.compute_mutation_times() + ts = tables.tree_sequence() + self.verify_subset_union(ts) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 7c74f5c1d1..94e44871ad 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -24,6 +24,7 @@ A collection of utilities to edit and construct tree sequences. """ import collections +import functools import json import random import string @@ -135,6 +136,41 @@ def insert_branch_mutations(ts, mutations_per_branch=1): return tables.tree_sequence() +def remove_mutation_times(ts): + tables = ts.tables + tables.mutations.time = np.full_like(tables.mutations.time, tskit.UNKNOWN_TIME) + return tables.tree_sequence() + + +def insert_discrete_time_mutations(ts, num_times=4, num_sites=10): + """ + Inserts mutations in the tree sequence at regularly-spaced num_sites + positions, at only a discrete set of times (the same for all trees): at + num_times times evenly spaced between 0 and the maximum tree height. + """ + tables = ts.tables + tables.sites.clear() + tables.mutations.clear() + height = max([t.time(t.roots[0]) for t in ts.trees()]) + for j, pos in enumerate(np.linspace(0, tables.sequence_length, num_sites + 1)[:-1]): + anc = "X" * j + tables.sites.add_row(position=pos, ancestral_state=anc) + t = ts.at(pos) + for k, s in enumerate(np.linspace(0, height, num_times)): + for n in t.nodes(): + if t.time(n) <= s and ( + (t.parent(n) == tskit.NULL) or (t.time(t.parent(n)) > s) + ): + tables.mutations.add_row( + site=j, node=n, derived_state=anc + str(k), time=s + ) + k += 1 + tables.sort() + tables.build_index() + tables.compute_mutation_parents() + return tables.tree_sequence() + + def insert_branch_sites(ts): """ Returns a copy of the specified tree sequence with a site on every branch @@ -576,7 +612,15 @@ def compute_mutation_parent(ts): return mutation_parent -def py_subset(tables, nodes, record_provenance=True): +def py_subset( + tables, + nodes, + record_provenance=True, + filter_populations=True, + filter_individuals=True, + filter_sites=True, + canonicalise=False, +): """ Naive implementation of the TableCollection.subset method using the Python API. """ @@ -588,6 +632,16 @@ def py_subset(tables, nodes, record_provenance=True): node_map = {} ind_map = {tskit.NULL: tskit.NULL} pop_map = {tskit.NULL: tskit.NULL} + if not filter_populations and not canonicalise: + for j, pop in enumerate(full.populations): + pop_map[j] = j + tables.populations.add_row(metadata=pop.metadata) + if not filter_individuals and not canonicalise: + for j, ind in enumerate(full.individuals): + ind_map[j] = j + tables.individuals.add_row( + flags=ind.flags, location=ind.location, metadata=ind.metadata + ) for old_id in nodes: node = full.nodes[old_id] if node.individual not in ind_map and node.individual != tskit.NULL: @@ -608,6 +662,15 @@ def py_subset(tables, nodes, record_provenance=True): node.metadata, ) node_map[old_id] = new_id + if canonicalise: + for j, ind in enumerate(full.individuals): + if j not in ind_map: + ind_map[j] = tables.individuals.add_row( + ind.flags, ind.location, ind.metadata + ) + for j, ind in enumerate(full.populations): + if j not in pop_map: + pop_map[j] = tables.populations.add_row(ind.metadata) for edge in full.edges: if edge.child in nodes and edge.parent in nodes: tables.edges.add_row( @@ -620,6 +683,11 @@ def py_subset(tables, nodes, record_provenance=True): if full.migrations.num_rows > 0: raise ValueError("Migrations are currently not supported in this operation.") site_map = {} + if canonicalise or not filter_sites: + for j, site in enumerate(full.sites): + site_map[j] = tables.sites.add_row( + site.position, site.ancestral_state, site.metadata + ) mutation_map = {tskit.NULL: tskit.NULL} for i, mut in enumerate(full.mutations): if mut.node in nodes: @@ -680,9 +748,6 @@ def py_union(tables, other, nodes, record_provenance=True, add_populations=True) node_map[other_id] = node_id for edge in other.edges: if (nodes[edge.parent] == tskit.NULL) or (nodes[edge.child] == tskit.NULL): - # can't do this right not because of sorting of mutations - if (nodes[edge.parent] == tskit.NULL) and (nodes[edge.child] != tskit.NULL): - raise ValueError("Cannot graft nodes above existing nodes.") tables.edges.add_row( left=edge.left, right=edge.right, @@ -708,6 +773,7 @@ def py_union(tables, other, nodes, record_provenance=True, add_populations=True) parent=tskit.NULL, time=mut.time, metadata=mut.metadata, + time=mut.time, ) mut_map[other_id] = mut_id # migration table @@ -717,6 +783,9 @@ def py_union(tables, other, nodes, record_provenance=True, add_populations=True) # sorting, deduplicating sites, and re-computing mutation parents tables.sort() tables.deduplicate_sites() + # need to sort again since after deduplicating sites, mutations may not be + # sorted by time within sites + tables.sort() tables.build_index() tables.compute_mutation_parents() @@ -764,6 +833,236 @@ def compute_mutation_times(ts): return tables.mutations.time +def shuffle_tables( + tables, + seed, + shuffle_edges=True, + shuffle_populations=True, + shuffle_individuals=True, + shuffle_sites=True, + shuffle_mutations=True, + shuffle_migrations=True, + keep_mutation_parent_order=False, +): + """ + Randomizes the order of rows in (possibly) all except the Node table. Note + that if mutations are completely shuffled, then TableCollection.sort() will + not necessarily produce valid tables (unless all mutation times are present + and distinct), since it only puts parent mutations before children if + canonical=True. However, setting keep_mutation_parent_order to True will + maintain the order of mutations within each site. + + :param TableCollection tables: The table collection that is shuffled (in place). + """ + rng = random.Random(seed) + orig = tables.copy() + tables.nodes.clear() + tables.individuals.clear() + tables.populations.clear() + tables.edges.clear() + tables.sites.clear() + tables.mutations.clear() + tables.drop_index() + # populations + randomised_pops = list(enumerate(orig.populations)) + if shuffle_populations: + rng.shuffle(randomised_pops) + pop_id_map = {tskit.NULL: tskit.NULL} + for j, p in randomised_pops: + pop_id_map[j] = tables.populations.add_row(metadata=p.metadata) + # individuals + randomised_inds = list(enumerate(orig.individuals)) + if shuffle_individuals: + rng.shuffle(randomised_inds) + ind_id_map = {tskit.NULL: tskit.NULL} + for j, i in randomised_inds: + ind_id_map[j] = tables.individuals.add_row( + flags=i.flags, location=i.location, metadata=i.metadata + ) + # nodes (same order, but remapped populations and individuals) + for n in orig.nodes: + tables.nodes.add_row( + flags=n.flags, + time=n.time, + population=pop_id_map[n.population], + individual=ind_id_map[n.individual], + metadata=n.metadata, + ) + # edges + randomised_edges = list(orig.edges) + if shuffle_edges: + rng.shuffle(randomised_edges) + for e in randomised_edges: + tables.edges.add_row(e.left, e.right, e.parent, e.child, metadata=e.metadata) + # migrations + randomised_migrations = list(orig.migrations) + if shuffle_migrations: + rng.shuffle(randomised_migrations) + for m in randomised_migrations: + tables.migrations.add_row( + m.left, + m.right, + m.node, + pop_id_map[m.source], + pop_id_map[m.dest], + m.time, + m.metadata, + ) + # sites + randomised_sites = list(enumerate(orig.sites)) + if shuffle_sites: + rng.shuffle(randomised_sites) + site_id_map = {} + for j, s in randomised_sites: + site_id_map[j] = tables.sites.add_row( + s.position, ancestral_state=s.ancestral_state, metadata=s.metadata + ) + # mutations + randomised_mutations = list(enumerate(orig.mutations)) + if shuffle_mutations: + if keep_mutation_parent_order: + # randomise *except* keeping parent mutations before children + mut_site_order = [mut.site for mut in orig.mutations] + rng.shuffle(mut_site_order) + mut_by_site = {s: [] for s in mut_site_order} + for j, m in enumerate(orig.mutations): + mut_by_site[m.site].insert(0, (j, m)) + randomised_mutations = [] + for s in mut_site_order: + randomised_mutations.append(mut_by_site[s].pop()) + else: + rng.shuffle(randomised_mutations) + mut_id_map = {tskit.NULL: tskit.NULL} + for j, (k, _) in enumerate(randomised_mutations): + mut_id_map[k] = j + for _, m in randomised_mutations: + tables.mutations.add_row( + site=site_id_map[m.site], + node=m.node, + derived_state=m.derived_state, + parent=mut_id_map[m.parent], + metadata=m.metadata, + time=m.time, + ) + if keep_mutation_parent_order: + assert np.all(tables.mutations.parent < np.arange(tables.mutations.num_rows)) + return tables + + +def orig_cmp_site(i, j, tables): + ret = tables.sites.position[i] - tables.sites.position[j] + if ret == 0: + ret = i - j + return ret + + +def orig_cmp_mutation(i, j, tables, site_order, canonical=False, num_descendants=None): + site_i = tables.mutations.site[i] + site_j = tables.mutations.site[j] + ret = site_order[site_i] - site_order[site_j] + if ( + ret == 0 + and (not tskit.is_unknown_time(tables.mutations.time[i])) + and (not tskit.is_unknown_time(tables.mutations.time[j])) + ): + ret = tables.mutations.time[j] - tables.mutations.time[i] + if canonical: + if ret == 0: + ret = num_descendants[j] - num_descendants[i] + if ret == 0: + ret = tables.mutations.node[i] - tables.mutations.node[j] + if ret == 0: + ret = i - j + return ret + + +def orig_cmp_edge(i, j, tables): + ret = ( + tables.nodes.time[tables.edges.parent[i]] + - tables.nodes.time[tables.edges.parent[j]] + ) + if ret == 0: + ret = tables.edges.parent[i] - tables.edges.parent[j] + if ret == 0: + ret = tables.edges.child[i] - tables.edges.child[j] + if ret == 0: + ret = tables.edges.left[i] - tables.edges.left[j] + return ret + + +def compute_mutation_num_descendants(tables): + mutations = tables.mutations + num_descendants = np.zeros(mutations.num_rows) + for p in mutations.parent: + while p != tskit.NULL: + num_descendants[p] += 1 + p = mutations.parent[p] + return num_descendants + + +def py_sort(tables, canonical=False): + cmp_edge = orig_cmp_edge + cmp_site = orig_cmp_site + cmp_mutation = orig_cmp_mutation + num_descendants = np.zeros(tables.mutations.num_rows) + if canonical: + # Reorder nodes, populations, and individuals. + # Note that this will keep unreferenced populations and individuals, + # but will not put these in canonical order. + tables.subset( + np.arange(tables.nodes.num_rows), + record_provenance=False, + canonicalise=True, + ) + num_descendants = compute_mutation_num_descendants(tables) + + copy = tables.copy() + tables.edges.clear() + tables.sites.clear() + tables.mutations.clear() + edge_key = functools.cmp_to_key(lambda a, b: cmp_edge(a, b, tables=copy)) + sorted_edges = sorted(range(copy.edges.num_rows), key=edge_key) + site_key = functools.cmp_to_key(lambda a, b: cmp_site(a, b, tables=copy)) + sorted_sites = sorted(range(copy.sites.num_rows), key=site_key) + site_id_map = {k: j for j, k in enumerate(sorted_sites)} + site_order = np.argsort(sorted_sites) + mut_key = functools.cmp_to_key( + lambda a, b: cmp_mutation( + a, + b, + tables=copy, + site_order=site_order, + canonical=canonical, + num_descendants=num_descendants, + ) + ) + sorted_muts = sorted(range(copy.mutations.num_rows), key=mut_key) + mut_id_map = {k: j for j, k in enumerate(sorted_muts)} + mut_id_map[tskit.NULL] = tskit.NULL + for edge_id in sorted_edges: + tables.edges.add_row( + copy.edges[edge_id].left, + copy.edges[edge_id].right, + copy.edges[edge_id].parent, + copy.edges[edge_id].child, + ) + for site_id in sorted_sites: + tables.sites.add_row( + copy.sites[site_id].position, + copy.sites[site_id].ancestral_state, + copy.sites[site_id].metadata, + ) + for mut_id in sorted_muts: + tables.mutations.add_row( + site_id_map[copy.mutations[mut_id].site], + copy.mutations[mut_id].node, + copy.mutations[mut_id].derived_state, + mut_id_map[copy.mutations[mut_id].parent], + copy.mutations[mut_id].metadata, + copy.mutations[mut_id].time, + ) + + def algorithm_T(ts): """ Simple implementation of algorithm T from the PLOS paper, taking into @@ -1293,3 +1592,46 @@ def genealogical_nearest_neighbours(ts, focal, reference_sets): L[L == 0] = 1 A /= L.reshape((len(focal), 1)) return A + + +def assert_table_collections_equal(t1, t2, ignore_provenance=False): + """ + Checks for table collection equality, but step-by-step, + so it's easy to see what's different. + """ + assert_tables_equal(t1.populations, t2.populations, "populations") + assert_tables_equal(t1.individuals, t2.individuals, "individuals") + assert_tables_equal(t1.nodes, t2.nodes, "nodes") + assert_tables_equal(t1.edges, t2.edges, "edges") + assert_tables_equal(t1.sites, t2.sites, "sites") + assert_tables_equal(t1.mutations, t2.mutations, "mutations") + assert_tables_equal(t1.migrations, t2.migrations, "migrations") + if not ignore_provenance: + assert_tables_equal(t1.provenances, t2.provenances, "provenances") + assert t1.metadata_schema == t2.metadata_schema + assert t1.metadata == t2.metadata + assert t1.metadata_bytes == t2.metadata_bytes + assert t1.sequence_length == t2.sequence_length + assert t1.equals(t2, ignore_provenance=ignore_provenance) + + +def assert_tables_equal(t1, t2, label=""): + if hasattr(t1, "metadata_schema"): + if t1.metadata_schema != t2.metadata_schema: + msg = ( + f"{label} :::::::::: t1 ::::::::::::\n{t1.metadata_schema}" + f"{label} :::::::::: t2 ::::::::::::\n{t1.metadata_schema}" + ) + raise AssertionError(msg) + for k, (e1, e2) in enumerate(zip(t1, t2)): + if e1 != e2: + msg = ( + f"{label} :::::::::: t1 (row {k}) ::::::::::::\n{e1}" + f"{label} :::::::::: t2 (row {k}) ::::::::::::\n{e2}" + ) + raise AssertionError(msg) + if t1.num_rows != t2.num_rows: + raise AssertionError( + f"{label}: t1.num_rows {t1.num_rows} != {t2.num_rows} t2.num_rows" + ) + assert t1 == t2 diff --git a/python/tskit/tables.py b/python/tskit/tables.py index df7c9fcf52..3d68615905 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2692,6 +2692,24 @@ def sort(self, edge_start=0): self._ll_tables.sort(edge_start) # TODO add provenance + def canonicalise(self): + """ + This puts the tables in *canonical* form - to do this, the individual + and population tables are sorted by the first node that refers to each + (see :meth:`TreeSequence.subset`, and note that individuals and + populations not referred to by any nodes will be put at the end of the + tables in their original order). Then, the remaining tables are sorted + as in :meth:`.sort`, with the modification that mutations are sorted by + site, then time, then number of descendant mutations (ensuring that + parent mutations occur before children), then node, then original order + in the tables. This ensures that any two tables with the same + information should be identical after canonical sorting, unless they + have multiple individuals or populations without nodes that refer to + them. + """ + self._ll_tables.canonicalise() + # TODO add provenance + def compute_mutation_parents(self): """ Modifies the tables in place, computing the ``parent`` column of the @@ -2739,6 +2757,9 @@ def deduplicate_sites(self): duplicate ``position`` (and keeping only the *first* entry for each site), and renumbering the ``site`` column of the mutation table appropriately. This requires the site table to be sorted by position. + + ..warning:: This method does not sort the tables afterwards, so + mutations may no longer be sorted by time. """ self._ll_tables.deduplicate_sites() # TODO add provenance @@ -3015,20 +3036,46 @@ def drop_index(self): """ self._ll_tables.drop_index() - def subset(self, nodes, record_provenance=True): + def subset( + self, + nodes, + record_provenance=True, + filter_populations=True, + filter_individuals=True, + filter_sites=True, + canonicalise=False, + ): """ Modifies the tables in place to contain only the entries referring to - the provided list of nodes, with nodes reordered according to the order - they appear in the list. See :meth:`TreeSequence.subset` for a more - detailed description. + the provided list of node IDs, with nodes reordered according to the + order they appear in the list. See :meth:`TreeSequence.subset` for a + more detailed description. + + Note: there are no sortedness requirements on the tables. :param list nodes: The list of nodes for which to retain information. This may be a numpy array (or array-like) object (dtype=np.int32). :param bool record_provenance: Whether to record a provenance entry in the provenance table for this operation. + :param bool filter_populations: Whether to remove populations not referenced by + retained nodes. If False, the population table will not be altered + in any way. + :param bool filter_individuals: Whether to remove individuals not referenced by + retained nodes. If False, the individuals table will not be altered + in any way. + :param bool filter_sites: Whether to remove sites not referenced by + retained mutations. If False, the site table will remain unchanged. + :param bool canonicalise: If True, retains all unused entries, putting + unreferenced individuals and populations last. """ nodes = util.safe_np_int_cast(nodes, np.int32) - self._ll_tables.subset(nodes) + self._ll_tables.subset( + nodes, + filter_populations=filter_populations, + filter_individuals=filter_individuals, + filter_sites=filter_sites, + canonicalise=canonicalise, + ) if record_provenance: parameters = {"command": "subset", "nodes": nodes.tolist()} self.provenances.add_row( diff --git a/python/tskit/trees.py b/python/tskit/trees.py index d392baae32..81bb126cea 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5133,42 +5133,83 @@ def trim(self, record_provenance=True): tables.trim(record_provenance) return tables.tree_sequence() - def subset(self, nodes, record_provenance=True): + def subset( + self, + nodes, + record_provenance=True, + filter_populations=True, + filter_individuals=True, + filter_sites=True, + canonicalise=False, + ): """ Returns a tree sequence modified to contain only the entries referring to the provided list of nodes, with nodes reordered according to the order - they appear in the ``nodes`` argument. Specifically, this subsets and reorders - each of the tables as follows: + they appear in the ``nodes`` argument. Note that this does *not* retain + the ancestry of these nodes - for that, see ::meth::`.simplify`. + Specifically, this subsets and reorders each of the tables of the tree + sequence as follows: 1. Nodes: if in the list of nodes, and in the order provided. - 2. Individuals and Populations: if referred to by a retained node, - and in the order first seen when traversing the list of retained nodes. + + 2. Individuals and Populations: if referred to by a retained node, and + in the order first seen when traversing the list of retained nodes. + 3. Edges: if both parent and child are retained nodes. + 4. Mutations: if the mutation's node is a retained node. + 5. Sites: if any mutations remain at the site after removing mutations. - Retained edges, mutations, and sites appear in the same - order as in the original tables. + Retained edges, mutations, and sites appear in the same order as in the + original tree sequence. If ``nodes`` is the entire list of nodes in the tables, then the - resulting tables will be identical to the original tables, but with - nodes (and individuals and populations) reordered. - - To instead subset the tables to a given portion of the *genome*, see + resulting tree sequence will be identical to the original tree + sequence, but with nodes (and individuals and populations) reordered. + + Note that individuals or populations not referred to by any remaining + nodes, and sites not referred to by any remaining mutations, + will be removed. This behavior is modified by setting ``filter_populations``, + ``filter_individuals``, and/or ``filter_sites`` to False, + in which case the relevant tables will remain unchanged. (This can be + helpful, for instance, to keep population IDs stable.) + + The ``canonicalise`` option reorders as above but retains even + unreferenced individuals, populations, and sites; unreferenced + individuals and populations will be last. This is useful to reorder + without losing information. + + To instead subset the tree sequence to a given portion of the *genome*, see :meth:`.keep_intervals`. **Note:** This is quite different from :meth:`.simplify`: the resulting - tables contain only the nodes given, not ancestral ones as well, and + tree sequence contain only the nodes given, not ancestral ones as well, and does not simplify the relationships in any way. :param list nodes: The list of nodes for which to retain information. This may be a numpy array (or array-like) object (dtype=np.int32). :param bool record_provenance: If True, add details of this operation to the provenance information of the returned tree sequence. (Default: True). + :param bool filter_populations: Whether to remove populations not referenced by + retained nodes. If False, the population table will remain unchanged. + :param bool filter_individuals: Whether to remove individuals not referenced by + retained nodes. If False, the individuals table will remain unchanged. + :param bool filter_sites: Whether to remove sites not referenced by + retained mutations. If False, the site table will remain unchanged. + :param bool canonicalise: If True, retains all unused entries, putting + unreferenced individuals and populations last. :rtype: .TreeSequence """ tables = self.dump_tables() - tables.subset(nodes, record_provenance) + tables.subset( + nodes, + record_provenance=record_provenance, + filter_populations=filter_populations, + filter_individuals=filter_individuals, + filter_sites=filter_sites, + canonicalise=canonicalise, + ) return tables.tree_sequence() def union( From 6086af39f567b2397799451a823fb1f53a5930f8 Mon Sep 17 00:00:00 2001 From: peter Date: Tue, 26 Jan 2021 20:25:00 -0800 Subject: [PATCH 2/6] canonical form --- c/tskit/tables.c | 1 - c/tskit/tables.h | 4 ++-- python/_tskitmodule.c | 3 +-- 3 files changed, 3 insertions(+), 5 deletions(-) diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 106baeee47..d5adb6475d 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -8929,7 +8929,6 @@ tsk_table_collection_sort( if (ret != 0) { goto out; } - ret = tsk_table_sorter_run(&sorter, start); if (ret != 0) { goto out; diff --git a/c/tskit/tables.h b/c/tskit/tables.h index cdb0964ede..dca61c2a91 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -2654,10 +2654,10 @@ int tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); /** -@brief Puts the tables into canonical order. +@brief Puts the tables into canonical form. @rst -This method puts tables in canonical order, which exceeds the usual +This method puts tables in canonical form, which exceeds the usual sortedness requirements in such a way that randomly reshuffled tables are guaranteed to always be sorted to the same order (with the exception of individuals or populations that are not referenced by any nodes). In diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index f0b244954a..5143e40276 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -5352,7 +5352,6 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) } memset(&start, 0, sizeof(start)); start.edges = edge_start; - err = tsk_table_collection_sort(self->tables, &start, 0); if (err != 0) { handle_library_error(err); @@ -5791,7 +5790,7 @@ static PyMethodDef TableCollection_methods[] = { { .ml_name = "canonicalise", .ml_meth = (PyCFunction) TableCollection_canonicalise, .ml_flags = METH_NOARGS, - .ml_doc = "Puts the tables in canonical order." }, + .ml_doc = "Puts the tables in canonical form." }, { .ml_name = "equals", .ml_meth = (PyCFunction) TableCollection_equals, .ml_flags = METH_VARARGS | METH_KEYWORDS, From 10b21b3678121b0480096b0bb36441bbd9fed2d6 Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 29 Jan 2021 09:42:46 -0800 Subject: [PATCH 3/6] fixup subset API --- c/CHANGELOG.rst | 11 ++++ c/tests/test_tables.c | 48 +++++++---------- c/tskit/tables.c | 30 +++-------- c/tskit/tables.h | 53 ++++++++++--------- python/_tskitmodule.c | 28 ++++------ python/tests/test_tables.py | 101 ++++++++++++------------------------ python/tests/tsutil.py | 20 +++---- python/tskit/tables.py | 26 ++++------ python/tskit/trees.py | 91 +++++++++++++------------------- 9 files changed, 160 insertions(+), 248 deletions(-) diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index aba42781e5..b82bfa30bd 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -7,10 +7,21 @@ - Add ``parents`` to the individual table to enable recording of pedigrees (:user:`ivan-krukov`, :user:`benjeffery`, :issue:`852`, :pr:`1125`, :pr:`866`, :pr:`1153`, :pr:`1177`). +- Added a ``tsk_table_collection_canonicalse`` method, that allows checking for equality between + tables that are equivalent up to reordering (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). + +- Removed a previous requirement on ``tsk_table_collection_union``, allowing for unioning of + new information both above and below shared history (:user:`petrelharp`, :user:`mufernando`, :pr:`1108`). + **Breaking changes** - Method ``tsk_individual_table_add_row`` has an extra arguments ``parents`` and ``parents_length``. +**Breaking changes** + +- Add an ``options`` argument to ``tsk_table_collection_subset`` (:user:`petrelharp`, :pr:`1108`), + to allow for retaining the order of populations. + **Bugfixes** ---------------------- diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 1ed130c946..e314c602cf 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -4108,11 +4108,6 @@ test_sort_tables_canonical(void) CU_ASSERT_EQUAL_FATAL(ret, 0); t2.sequence_length = 1.0; - ret = tsk_table_collection_clear(&t1, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_clear(&t2, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - parse_nodes(nodes, &t1.nodes); CU_ASSERT_EQUAL_FATAL(t1.nodes.num_rows, 7); parse_individuals(individuals, &t1.individuals); @@ -5149,8 +5144,7 @@ test_table_collection_subset_with_options(tsk_flags_t options) // empty nodes should get empty tables ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, NULL, 0, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&tables_copy, NULL, 0, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); @@ -5158,11 +5152,10 @@ test_table_collection_subset_with_options(tsk_flags_t options) CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); - // unless FILTER_POPULATIONS is not provided + // unless NO_CHANGE_POPULATIONS is provided ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset( - &tables_copy, NULL, 0, TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&tables_copy, NULL, 0, TSK_NO_CHANGE_POPULATIONS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); @@ -5171,28 +5164,28 @@ test_table_collection_subset_with_options(tsk_flags_t options) CU_ASSERT_FATAL( tsk_population_table_equals(&tables.populations, &tables_copy.populations, 0)); - // or FILTER_INDIVIDUALS + // or KEEP_UNREFERENCED ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset( - &tables_copy, NULL, 0, TSK_FILTER_POPULATIONS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&tables_copy, NULL, 0, TSK_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 3); - CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); - CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 2); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); + CU_ASSERT_FATAL(tsk_site_table_equals(&tables.sites, &tables_copy.sites, 0)); - // or KEEP_UNUSED_SITES + // or both ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = tsk_table_collection_subset( - &tables_copy, NULL, 0, TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS); + &tables_copy, NULL, 0, TSK_KEEP_UNREFERENCED | TSK_NO_CHANGE_POPULATIONS); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, 0); - CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 0); - CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, 0); + CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 3); CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, 0); + CU_ASSERT_FATAL( + tsk_population_table_equals(&tables.populations, &tables_copy.populations, 0)); CU_ASSERT_FATAL(tsk_site_table_equals(&tables.sites, &tables_copy.sites, 0)); // the identity transformation, since unused inds/pops are at the end @@ -5201,15 +5194,14 @@ test_table_collection_subset_with_options(tsk_flags_t options) } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); // or, remove unused things: ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_node_table_equals(&tables.nodes, &tables_copy.nodes, 0)); CU_ASSERT_EQUAL_FATAL(tables_copy.individuals.num_rows, 2); @@ -5224,9 +5216,9 @@ test_table_collection_subset_with_options(tsk_flags_t options) } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_CANONICALISE); + ret = tsk_table_collection_subset(&tables_copy, nodes, 4, TSK_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); @@ -5291,7 +5283,7 @@ test_table_collection_subset_unsorted(void) } ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT); CU_ASSERT_EQUAL_FATAL(ret, 0); - ret = tsk_table_collection_subset(&tables_copy, nodes, 3, TSK_CANONICALISE); + ret = tsk_table_collection_subset(&tables_copy, nodes, 3, TSK_KEEP_UNREFERENCED); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); @@ -5469,8 +5461,7 @@ test_table_collection_union(void) node_mapping[0] = 0; node_mapping[1] = 1; node_mapping[2] = 2; - ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy, 0)); @@ -5659,8 +5650,7 @@ test_table_collection_union_middle_merge(void) ret = tsk_table_collection_union(&ta, &tb, node_mapping, 0); CU_ASSERT_EQUAL(ret, 0); - ret = tsk_table_collection_subset(&ta, node_order, 5, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&ta, node_order, 5, 0); CU_ASSERT_EQUAL(ret, 0); ret = tsk_provenance_table_clear(&ta.provenances); CU_ASSERT_EQUAL(ret, 0); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index d5adb6475d..b73fc9bbee 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4983,7 +4983,7 @@ tsk_table_collection_canonicalise( nodes[k] = k; } ret = tsk_table_collection_subset( - self, nodes, self->nodes.num_rows, TSK_CANONICALISE); + self, nodes, self->nodes.num_rows, TSK_KEEP_UNREFERENCED); if (ret != 0) { goto out; } @@ -9455,10 +9455,8 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, tsk_population_t pop; tsk_site_t site; tsk_mutation_t mut; - bool canonicalise = !!(options & TSK_CANONICALISE); - bool no_filter_populations = !(options & TSK_FILTER_POPULATIONS); - bool no_filter_individuals = !(options & TSK_FILTER_INDIVIDUALS); - bool no_filter_sites = !(options & TSK_FILTER_SITES); + bool keep_unreferenced = !!(options & TSK_KEEP_UNREFERENCED); + bool no_change_populations = !!(options & TSK_NO_CHANGE_POPULATIONS); ret = tsk_table_collection_copy(self, &tables, 0); if (ret != 0) { @@ -9489,17 +9487,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, memset(site_map, 0xff, tables.sites.num_rows * sizeof(*site_map)); memset(mutation_map, 0xff, tables.mutations.num_rows * sizeof(*mutation_map)); - if (no_filter_individuals && !canonicalise) { - ret = tsk_individual_table_copy( - &tables.individuals, &self->individuals, TSK_NO_INIT); - if (ret < 0) { - goto out; - } - for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { - individual_map[k] = k; - } - } - if (no_filter_populations && !canonicalise) { + if (no_change_populations) { ret = tsk_population_table_copy( &tables.populations, &self->populations, TSK_NO_INIT); if (ret < 0) { @@ -9518,7 +9506,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, goto out; } } - if (canonicalise) { + if (keep_unreferenced) { // keep unused individuals and populations for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { if (individual_map[k] == TSK_NULL) { @@ -9575,7 +9563,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, // keep retained sites in their original order j = 0; for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) { - if (canonicalise || no_filter_sites || site_map[k] != TSK_NULL) { + if (keep_unreferenced || site_map[k] != TSK_NULL) { tsk_site_table_get_row_unsafe(&tables.sites, k, &site); ret = tsk_site_table_add_row(&self->sites, site.position, site.ancestral_state, site.ancestral_state_length, site.metadata, @@ -9665,13 +9653,11 @@ tsk_check_subset_equality(tsk_table_collection_t *self, if (ret != 0) { goto out; } - ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&self_copy, self_nodes, num_shared_nodes, 0); if (ret != 0) { goto out; } - ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes, - TSK_FILTER_POPULATIONS | TSK_FILTER_INDIVIDUALS | TSK_FILTER_SITES); + ret = tsk_table_collection_subset(&other_copy, other_nodes, num_shared_nodes, 0); if (ret != 0) { goto out; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index dca61c2a91..d68fc593e0 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -700,7 +700,7 @@ typedef struct { /**@} */ -/* Flags for simplify() and/or subset() */ +/* Flags for simplify() */ #define TSK_FILTER_SITES (1 << 0) #define TSK_FILTER_POPULATIONS (1 << 1) #define TSK_FILTER_INDIVIDUALS (1 << 2) @@ -708,7 +708,10 @@ typedef struct { #define TSK_KEEP_UNARY (1 << 4) #define TSK_KEEP_INPUT_ROOTS (1 << 5) #define TSK_KEEP_UNARY_IN_INDIVIDUALS (1 << 6) -#define TSK_CANONICALISE (1 << 7) + +/* Flags for subset() */ +#define TSK_NO_CHANGE_POPULATIONS (1 << 0) +#define TSK_KEEP_UNREFERENCED (1 << 1) /* Flags for check_integrity */ #define TSK_CHECK_EDGE_ORDERING (1 << 0) @@ -2748,36 +2751,36 @@ int tsk_table_collection_simplify(tsk_table_collection_t *self, const tsk_id_t * Reduces the table collection to contain only the entries referring to the provided list of nodes, with nodes reordered according to the order they appear in the ``nodes`` argument. Specifically, this subsets and reorders -each of the tables as follows: +each of the tables as follows (but see options, below): 1. Nodes: if in the list of nodes, and in the order provided. -2. Individuals, if `TSK_FILTER_INDIVIDUALS`: if referred to by a retained node, - and in the order first seen when traversing the list of retained nodes. -3. Populations, if `TSK_FILTER_POPULATIONS`: the same as Individuals. -4. Edges: if both parent and child are retained nodes. -5. Mutations: if the mutation's node is a retained node. -6. Sites, if `TSK_FILTER_SITES`: if any mutations remain at the site after - removing mutations. +2. Individuals and Populations: if referred to by a retained node, and in the + order first seen when traversing the list of retained nodes. +3. Edges: if both parent and child are retained nodes. +4. Mutations: if the mutation's node is a retained node. +5. Sites: if any mutations remain at the site after removing mutations. Retained edges, mutations, and sites appear in the same -order as in the original tables. If any of `TSK_FILTER_INDIVIDUALS`, -`TSK_FILTER_POPULATIONS`, or `TSK_FILTER_SITES` are *not* provided, -then the respective tables will be *unchanged*. - -If ``nodes`` is the entire list of nodes in the tables, then the -resulting tables will be identical to the original tables, but with -nodes (and individuals and populations) reordered. - -An additional option, `TSK_CANONICALISE`, will reorder both individual and -population tables, as described above if `TSK_FILTER_INDIVIDUALS` and -`TSK_FILTER_POPULATIONS` were provided, but will retain any unreferenced -individuals and populations at the end of the individual and population tables. -It will also keep unreferenced sites (as if `TSK_FILTER_SITES` was not -provided). This is useful so that this can be used to reorder the tables -without losing information. +order as in the original tables. This function does *not* require the tables to be sorted. +**Options**: + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_NO_CHANGE_POPULATIONS + If this flag is provided, the population table will not be changed + in any way. + +TSK_KEEP_UNREFERENCED + If this flag is provided, then unreferenced sites, individuals, and populations + will not be removed. If so, the site table will not be changed, + unreferenced individuals will be placed last, in their original order, and + (unless TSK_NO_CHANGE_POPULATIONS is also provided), unreferenced + populations will also be placed last, in their original order. + .. note:: Migrations are currently not supported by susbset, and an error will be raised if we attempt call subset on a table collection with greater than zero migrations. diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 5143e40276..c3d73eade5 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -5091,19 +5091,17 @@ TableCollection_subset(TableCollection *self, PyObject *args, PyObject *kwds) PyArrayObject *nodes_array = NULL; npy_intp *shape; tsk_flags_t options = 0; - int filter_populations = true; - int filter_individuals = true; - int filter_sites = true; - int canonicalise = false; + int reorder_populations = true; + int remove_unreferenced = true; size_t num_nodes; - static char *kwlist[] = { "nodes", "filter_populations", "filter_individuals", - "filter_sites", "canonicalise", NULL }; + static char *kwlist[] + = { "nodes", "reorder_populations", "remove_unreferenced", NULL }; if (TableCollection_check_state(self) != 0) { goto out; } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|iiii", kwlist, &nodes, - &filter_populations, &filter_individuals, &filter_sites, &canonicalise)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O|ii", kwlist, &nodes, + &reorder_populations, &remove_unreferenced)) { goto out; } nodes_array @@ -5113,17 +5111,11 @@ TableCollection_subset(TableCollection *self, PyObject *args, PyObject *kwds) } shape = PyArray_DIMS(nodes_array); num_nodes = shape[0]; - if (filter_populations) { - options |= TSK_FILTER_POPULATIONS; - } - if (filter_individuals) { - options |= TSK_FILTER_INDIVIDUALS; - } - if (filter_sites) { - options |= TSK_FILTER_SITES; + if (!reorder_populations) { + options |= TSK_NO_CHANGE_POPULATIONS; } - if (canonicalise) { - options |= TSK_CANONICALISE; + if (!remove_unreferenced) { + options |= TSK_KEEP_UNREFERENCED; } err = tsk_table_collection_subset( diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 6672fed907..2549553b48 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -3235,38 +3235,24 @@ def get_examples(self, seed): yield self.get_wf_example(seed=seed) def verify_subset_equality(self, tables, nodes): - for kp, ki, ks, cn in [ - [False, False, False, False], - [True, False, False, False], - [False, True, False, False], - [False, False, True, False], - [True, True, False, False], - [True, False, True, False], - [False, True, True, False], - [True, True, True, False], - [False, False, False, True], - [True, False, False, True], - ]: - sub1 = tables.copy() - sub2 = tables.copy() - tsutil.py_subset( - sub1, - nodes, - record_provenance=False, - filter_populations=kp, - filter_individuals=ki, - filter_sites=ks, - canonicalise=cn, - ) - sub2.subset( - nodes, - record_provenance=False, - filter_populations=kp, - filter_individuals=ki, - filter_sites=ks, - canonicalise=cn, - ) - tsutil.assert_table_collections_equal(sub1, sub2) + for rp in [True, False]: + for ru in [True, False]: + sub1 = tables.copy() + sub2 = tables.copy() + tsutil.py_subset( + sub1, + nodes, + record_provenance=False, + reorder_populations=rp, + remove_unreferenced=ru, + ) + sub2.subset( + nodes, + record_provenance=False, + reorder_populations=rp, + remove_unreferenced=ru, + ) + tsutil.assert_table_collections_equal(sub1, sub2) def verify_subset(self, tables, nodes): self.verify_subset_equality(tables, nodes) @@ -3377,7 +3363,7 @@ def test_shuffled_tables(self): tsutil.shuffle_tables(tables2, 7000) tables2.subset( np.arange(tables.nodes.num_rows), - canonicalise=True, + remove_unreferenced=False, ) assert tables.nodes.num_rows == tables2.nodes.num_rows assert tables.individuals.num_rows == tables2.individuals.num_rows @@ -3415,46 +3401,27 @@ def test_empty_nodes(self): assert subset.mutations.num_rows == 0 assert subset.provenances == tables.provenances - def test_no_filter_sites(self): + def test_no_remove_unreferenced(self): tables = tskit.TableCollection(sequence_length=10) tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) tables.nodes.add_row(time=1) tables.edges.add_row(parent=1, child=0, left=0, right=10) - for k in range(10): + for k in range(5): tables.sites.add_row(position=k, ancestral_state=str(k)) - sub_tables = tables.copy() - sub_tables.subset([], filter_sites=False) - assert tables.sites == sub_tables.sites - ts = tables.tree_sequence() - sub_tables = ts.subset([], filter_sites=False).tables - assert tables.sites == sub_tables.sites - - def test_no_filter_populations(self): - tables = tskit.TableCollection(sequence_length=10) - tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) - tables.nodes.add_row(time=1) - tables.edges.add_row(parent=1, child=0, left=0, right=10) - for k in range(10): + # these are all unused, so should remain unchanged + for k in range(5): tables.populations.add_row(metadata=str(k).encode()) - sub_tables = tables.copy() - sub_tables.subset([], filter_populations=False) - assert tables.populations == sub_tables.populations - ts = tables.tree_sequence() - sub_tables = ts.subset([], filter_populations=False).tables - assert tables.populations == sub_tables.populations - - def test_no_filter_individuals(self): - tables = tskit.TableCollection(sequence_length=10) - tables.nodes.add_row(time=0, flags=tskit.NODE_IS_SAMPLE) - tables.nodes.add_row(time=1) - tables.edges.add_row(parent=1, child=0, left=0, right=10) - for k in range(10): + for k in range(5): tables.individuals.add_row(metadata=str(k).encode()) sub_tables = tables.copy() - sub_tables.subset([], filter_individuals=False) + sub_tables.subset([], remove_unreferenced=False) + assert tables.sites == sub_tables.sites + assert tables.populations == sub_tables.populations assert tables.individuals == sub_tables.individuals ts = tables.tree_sequence() - sub_tables = ts.subset([], filter_individuals=False).tables + sub_tables = ts.subset([], remove_unreferenced=False).tables + assert tables.sites == sub_tables.sites + assert tables.populations == sub_tables.populations assert tables.individuals == sub_tables.individuals @@ -3623,7 +3590,7 @@ def verify_subset_union(self, ts): tables = ts.tables # remove unused individuals and populations since if there's more than # one of these then it can't be canonically sorted below - tables.subset(np.arange(tables.nodes.num_rows), filter_sites=False) + tables.subset(np.arange(tables.nodes.num_rows)) has_unknown_times = np.any(tskit.is_unknown_time(tables.mutations.time)) if has_unknown_times: tsutil.shuffle_tables(tables, seed=2545, keep_mutation_parent_order=True) @@ -3644,10 +3611,10 @@ def verify_subset_union(self, ts): # relationships to target target_plus = np.concatenate([target, target_relatives]) sub1 = tables.copy() - sub1.subset(target_plus, filter_populations=False, filter_sites=False) + sub1.subset(target_plus, reorder_populations=False) # everything else new_tables = tables.copy() - new_tables.subset(not_target, filter_populations=False, filter_sites=False) + new_tables.subset(not_target, reorder_populations=False) # and union back together mapping12 = [tskit.NULL for _ in target] + [ not_target.index(n) for n in target_relatives @@ -3661,7 +3628,7 @@ def verify_subset_union(self, ts): for j, t in enumerate(target): reorder[t:] -= 1 reorder[t] = new_tables.nodes.num_rows - len(target) + j - new_tables.subset(reorder, filter_sites=False) + new_tables.subset(reorder) new_tables.canonicalise() tables.canonicalise() tsutil.assert_table_collections_equal( diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 94e44871ad..fc0127f9da 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -616,10 +616,8 @@ def py_subset( tables, nodes, record_provenance=True, - filter_populations=True, - filter_individuals=True, - filter_sites=True, - canonicalise=False, + reorder_populations=True, + remove_unreferenced=True, ): """ Naive implementation of the TableCollection.subset method using the Python API. @@ -632,16 +630,10 @@ def py_subset( node_map = {} ind_map = {tskit.NULL: tskit.NULL} pop_map = {tskit.NULL: tskit.NULL} - if not filter_populations and not canonicalise: + if not reorder_populations: for j, pop in enumerate(full.populations): pop_map[j] = j tables.populations.add_row(metadata=pop.metadata) - if not filter_individuals and not canonicalise: - for j, ind in enumerate(full.individuals): - ind_map[j] = j - tables.individuals.add_row( - flags=ind.flags, location=ind.location, metadata=ind.metadata - ) for old_id in nodes: node = full.nodes[old_id] if node.individual not in ind_map and node.individual != tskit.NULL: @@ -662,7 +654,7 @@ def py_subset( node.metadata, ) node_map[old_id] = new_id - if canonicalise: + if not remove_unreferenced: for j, ind in enumerate(full.individuals): if j not in ind_map: ind_map[j] = tables.individuals.add_row( @@ -683,7 +675,7 @@ def py_subset( if full.migrations.num_rows > 0: raise ValueError("Migrations are currently not supported in this operation.") site_map = {} - if canonicalise or not filter_sites: + if not remove_unreferenced: for j, site in enumerate(full.sites): site_map[j] = tables.sites.add_row( site.position, site.ancestral_state, site.metadata @@ -1012,7 +1004,7 @@ def py_sort(tables, canonical=False): tables.subset( np.arange(tables.nodes.num_rows), record_provenance=False, - canonicalise=True, + remove_unreferenced=False, ) num_descendants = compute_mutation_num_descendants(tables) diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 3d68615905..5cf8e03504 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3040,10 +3040,8 @@ def subset( self, nodes, record_provenance=True, - filter_populations=True, - filter_individuals=True, - filter_sites=True, - canonicalise=False, + reorder_populations=True, + remove_unreferenced=True, ): """ Modifies the tables in place to contain only the entries referring to @@ -3057,24 +3055,18 @@ def subset( may be a numpy array (or array-like) object (dtype=np.int32). :param bool record_provenance: Whether to record a provenance entry in the provenance table for this operation. - :param bool filter_populations: Whether to remove populations not referenced by - retained nodes. If False, the population table will not be altered - in any way. - :param bool filter_individuals: Whether to remove individuals not referenced by - retained nodes. If False, the individuals table will not be altered + :param bool reorder_populations: Whether to reorder the population table + (default: True). If False, the population table will not be altered in any way. - :param bool filter_sites: Whether to remove sites not referenced by - retained mutations. If False, the site table will remain unchanged. - :param bool canonicalise: If True, retains all unused entries, putting - unreferenced individuals and populations last. + :param bool remove_unreferenced: Whether sites, individuals, and populations + that are not referred to by any retained entries in the tables should + be removed (default: True). See the description for details. """ nodes = util.safe_np_int_cast(nodes, np.int32) self._ll_tables.subset( nodes, - filter_populations=filter_populations, - filter_individuals=filter_individuals, - filter_sites=filter_sites, - canonicalise=canonicalise, + reorder_populations=reorder_populations, + remove_unreferenced=remove_unreferenced, ) if record_provenance: parameters = {"command": "subset", "nodes": nodes.tolist()} diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 81bb126cea..fa0b6f843a 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5137,78 +5137,57 @@ def subset( self, nodes, record_provenance=True, - filter_populations=True, - filter_individuals=True, - filter_sites=True, - canonicalise=False, + reorder_populations=True, + remove_unreferenced=True, ): """ - Returns a tree sequence modified to contain only the entries referring to - the provided list of nodes, with nodes reordered according to the order - they appear in the ``nodes`` argument. Note that this does *not* retain + Returns a tree sequence containing only information directly + referencing the provided list of nodes to retain. The result will + retain only the nodes whose IDs are listed in ``nodes``, only edges for + which both parent and child are in ``nodes```, only mutations whose + node is in ``nodes``, and only individuals that are referred to by one + of the retained nodes. Note that this does *not* retain the ancestry of these nodes - for that, see ::meth::`.simplify`. - Specifically, this subsets and reorders each of the tables of the tree - sequence as follows: - - 1. Nodes: if in the list of nodes, and in the order provided. - - 2. Individuals and Populations: if referred to by a retained node, and - in the order first seen when traversing the list of retained nodes. - - 3. Edges: if both parent and child are retained nodes. - - 4. Mutations: if the mutation's node is a retained node. - - 5. Sites: if any mutations remain at the site after removing mutations. - - Retained edges, mutations, and sites appear in the same order as in the - original tree sequence. - If ``nodes`` is the entire list of nodes in the tables, then the - resulting tree sequence will be identical to the original tree - sequence, but with nodes (and individuals and populations) reordered. + This has the side effect of reordering the nodes, individuals, and + populations in the tree sequence: the nodes in the new tree sequence + will be in the order provided in ``nodes``, and both individuals and + populations will be ordered by the earliest retained node that refers + to them. (However, ``reorder_populations`` may be set to False + to keep the population table unchanged.) - Note that individuals or populations not referred to by any remaining - nodes, and sites not referred to by any remaining mutations, - will be removed. This behavior is modified by setting ``filter_populations``, - ``filter_individuals``, and/or ``filter_sites`` to False, - in which case the relevant tables will remain unchanged. (This can be - helpful, for instance, to keep population IDs stable.) + By default, the method removes all individuals and populations not + referenced by any nodes, and all sites not referenced by any mutations. + To retain these unreferencd individuals, populations, and sites, pass + ``remove_unreferenced=False``. If this is done, the site table will + remain unchanged, unreferenced individuals will appear at the end of + the individuals table (and in their original order), and unreferenced + populations will appear at the end of the population table (unless + ``reorder_populations=False``). - The ``canonicalise`` option reorders as above but retains even - unreferenced individuals, populations, and sites; unreferenced - individuals and populations will be last. This is useful to reorder - without losing information. + .. seealso:: - To instead subset the tree sequence to a given portion of the *genome*, see - :meth:`.keep_intervals`. - - **Note:** This is quite different from :meth:`.simplify`: the resulting - tree sequence contain only the nodes given, not ancestral ones as well, and - does not simplify the relationships in any way. + :meth:`.keep_intervals` for subsetting a given portion of the genome; + :meth:`.simplify` for retaining the ancestry of a subset of nodes. :param list nodes: The list of nodes for which to retain information. This may be a numpy array (or array-like) object (dtype=np.int32). - :param bool record_provenance: If True, add details of this operation to the - provenance information of the returned tree sequence. (Default: True). - :param bool filter_populations: Whether to remove populations not referenced by - retained nodes. If False, the population table will remain unchanged. - :param bool filter_individuals: Whether to remove individuals not referenced by - retained nodes. If False, the individuals table will remain unchanged. - :param bool filter_sites: Whether to remove sites not referenced by - retained mutations. If False, the site table will remain unchanged. - :param bool canonicalise: If True, retains all unused entries, putting - unreferenced individuals and populations last. + :param bool record_provenance: Whether to record a provenance entry + in the provenance table for this operation. + :param bool reorder_populations: Whether to reorder populations + (default: True). If False, the population table will not be altered in + any way. + :param bool remove_unreferenced: Whether sites, individuals, and populations + that are not referred to by any retained entries in the tables should + be removed (default: True). See the description for details. :rtype: .TreeSequence """ tables = self.dump_tables() tables.subset( nodes, record_provenance=record_provenance, - filter_populations=filter_populations, - filter_individuals=filter_individuals, - filter_sites=filter_sites, - canonicalise=canonicalise, + reorder_populations=reorder_populations, + remove_unreferenced=remove_unreferenced, ) return tables.tree_sequence() From 0f3687a34d527a485011aa2c8f2409a287be2136 Mon Sep 17 00:00:00 2001 From: peter Date: Fri, 29 Jan 2021 14:55:38 -0800 Subject: [PATCH 4/6] added argument to canonicalise --- c/tests/test_tables.c | 40 +++++++++++++++-- c/tskit/tables.c | 7 ++- c/tskit/tables.h | 16 +++++-- python/_tskitmodule.c | 16 +++++-- python/tests/test_tables.py | 25 +++++------ python/tests/tsutil.py | 86 ++++++++++++++++++++++--------------- python/tskit/tables.py | 26 +++++++---- 7 files changed, 148 insertions(+), 68 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index e314c602cf..4ec12f7f22 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -4087,8 +4087,7 @@ test_sort_tables_canonical(void) "0 3 -1 -1\n"; const char *individuals_sorted = "0 1.0\n" "0 3.0\n" - "0 2.0\n" - "0 0.0\n"; + "0 2.0\n"; const char *sites_sorted = "0 0\n" "0.1 0\n" "0.2 0\n"; @@ -4100,6 +4099,10 @@ test_sort_tables_canonical(void) "2 2 1 -1 2\n" "2 1 4 4 0.5\n" "2 1 5 6 0.5\n"; + const char *individuals_sorted_kept = "0 1.0\n" + "0 3.0\n" + "0 2.0\n" + "0 0.0\n"; ret = tsk_table_collection_init(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -4128,10 +4131,9 @@ test_sort_tables_canonical(void) parse_nodes(nodes_sorted, &t2.nodes); tsk_population_table_add_row(&t2.populations, "C", 1); tsk_population_table_add_row(&t2.populations, "A", 1); - tsk_population_table_add_row(&t2.populations, "B", 1); CU_ASSERT_EQUAL_FATAL(t2.nodes.num_rows, 7); parse_individuals(individuals_sorted, &t2.individuals); - CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 4); + CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 3); parse_edges(single_tree_ex_edges, &t2.edges); CU_ASSERT_EQUAL_FATAL(t2.edges.num_rows, 6); parse_sites(sites_sorted, &t2.sites); @@ -4141,6 +4143,36 @@ test_sort_tables_canonical(void) CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); + ret = tsk_table_collection_clear(&t1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_clear(&t2, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + // now with KEEP_UNREFERENCED + parse_nodes(nodes, &t1.nodes); + parse_individuals(individuals, &t1.individuals); + tsk_population_table_add_row(&t1.populations, "A", 1); + tsk_population_table_add_row(&t1.populations, "B", 1); + tsk_population_table_add_row(&t1.populations, "C", 1); + parse_edges(single_tree_ex_edges, &t1.edges); + parse_sites(sites, &t1.sites); + parse_mutations(mutations, &t1.mutations); + + ret = tsk_table_collection_canonicalise(&t1, TSK_KEEP_UNREFERENCED); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + parse_nodes(nodes_sorted, &t2.nodes); + tsk_population_table_add_row(&t2.populations, "C", 1); + tsk_population_table_add_row(&t2.populations, "A", 1); + tsk_population_table_add_row(&t2.populations, "B", 1); + parse_individuals(individuals_sorted_kept, &t2.individuals); + CU_ASSERT_EQUAL_FATAL(t2.individuals.num_rows, 4); + parse_edges(single_tree_ex_edges, &t2.edges); + parse_sites(sites_sorted, &t2.sites); + parse_mutations(mutations_sorted, &t2.mutations); + + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); + tsk_table_collection_free(&t2); tsk_table_collection_free(&t1); } diff --git a/c/tskit/tables.c b/c/tskit/tables.c index b73fc9bbee..9f649004c4 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4961,13 +4961,13 @@ tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) } int TSK_WARN_UNUSED -tsk_table_collection_canonicalise( - tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) +tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options) { int ret = 0; tsk_id_t k; tsk_id_t *nodes = NULL; tsk_table_sorter_t sorter; + tsk_flags_t subset_options = options & TSK_KEEP_UNREFERENCED; ret = tsk_table_sorter_init(&sorter, self, 0); if (ret != 0) { @@ -4982,8 +4982,7 @@ tsk_table_collection_canonicalise( for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { nodes[k] = k; } - ret = tsk_table_collection_subset( - self, nodes, self->nodes.num_rows, TSK_KEEP_UNREFERENCED); + ret = tsk_table_collection_subset(self, nodes, self->nodes.num_rows, subset_options); if (ret != 0) { goto out; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index d68fc593e0..9a48b7b701 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -2662,9 +2662,19 @@ int tsk_table_collection_sort( @rst This method puts tables in canonical form, which exceeds the usual sortedness requirements in such a way that randomly reshuffled tables are -guaranteed to always be sorted to the same order (with the exception of -individuals or populations that are not referenced by any nodes). In -particular, this method puts mutation parents before their children. +guaranteed to always be sorted to the same order. In particular, this method +puts mutation parents before their children. + +**Options**: + +Options can be specified by providing one or more of the following bitwise +flags: + +TSK_KEEP_UNREFERENCED + By default, this will remove any unreferenced sites, populations, and + individuals. If this flag is provided, these will be retained, with + unreferenced individuals and populations at the end of the tables, in + their original order. @endrst diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index c3d73eade5..c3f595b11a 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -5355,15 +5355,25 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) } static PyObject * -TableCollection_canonicalise(TableCollection *self) +TableCollection_canonicalise(TableCollection *self, PyObject *args, PyObject *kwds) { int err; PyObject *ret = NULL; + tsk_flags_t options = 0; + int remove_unreferenced = true; + static char *kwlist[] = { "remove_unreferenced", NULL }; if (TableCollection_check_state(self) != 0) { goto out; } - err = tsk_table_collection_canonicalise(self->tables, 0); + if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist, &remove_unreferenced)) { + goto out; + } + if (!remove_unreferenced) { + options |= TSK_KEEP_UNREFERENCED; + } + + err = tsk_table_collection_canonicalise(self->tables, options); if (err != 0) { handle_library_error(err); goto out; @@ -5781,7 +5791,7 @@ static PyMethodDef TableCollection_methods[] = { .ml_doc = "Sorts the tables to satisfy tree sequence requirements." }, { .ml_name = "canonicalise", .ml_meth = (PyCFunction) TableCollection_canonicalise, - .ml_flags = METH_NOARGS, + .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Puts the tables in canonical form." }, { .ml_name = "equals", .ml_meth = (PyCFunction) TableCollection_equals, diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 2549553b48..149c08012a 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -1408,15 +1408,16 @@ def verify_sort_equality(self, tables, seed): tsutil.assert_table_collections_equal(tables1, tables2) def verify_canonical_equality(self, tables, seed): - tables1 = tables.copy() - tsutil.shuffle_tables( - tables1, - seed, - ) - tables2 = tables1.copy() - tables1.canonicalise() - tsutil.py_sort(tables2, canonical=True) - tsutil.assert_table_collections_equal(tables1, tables2) + for ru in [True, False]: + tables1 = tables.copy() + tsutil.shuffle_tables( + tables1, + seed, + ) + tables2 = tables1.copy() + tables1.canonicalise(remove_unreferenced=ru) + tsutil.py_canonicalise(tables2, remove_unreferenced=ru) + tsutil.assert_table_collections_equal(tables1, tables2) def verify_sort_mutation_consistency(self, orig_tables, seed): tables = orig_tables.copy() @@ -1456,17 +1457,17 @@ def verify_randomise_tables(self, orig_tables, seed): tsutil.assert_table_collections_equal(tables, sorted_tables) # Now also randomize sites and mutations - tables.canonicalise() + tables.canonicalise(remove_unreferenced=False) sorted_tables = tables.copy() tsutil.shuffle_tables( tables, seed=1234, shuffle_populations=False, shuffle_individuals=False ) - tables.canonicalise() + tables.canonicalise(remove_unreferenced=False) tsutil.assert_table_collections_equal(tables, sorted_tables) # Finally, randomize everything else tsutil.shuffle_tables(tables, seed=1234) - tables.canonicalise() + tables.canonicalise(remove_unreferenced=False) tsutil.assert_table_collections_equal(tables, sorted_tables) def verify_sort(self, tables, seed): diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index fc0127f9da..3fd29b55c9 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -840,8 +840,8 @@ def shuffle_tables( Randomizes the order of rows in (possibly) all except the Node table. Note that if mutations are completely shuffled, then TableCollection.sort() will not necessarily produce valid tables (unless all mutation times are present - and distinct), since it only puts parent mutations before children if - canonical=True. However, setting keep_mutation_parent_order to True will + and distinct), since currently only canonicalise puts parent mutations + before children. However, setting keep_mutation_parent_order to True will maintain the order of mutations within each site. :param TableCollection tables: The table collection that is shuffled (in place). @@ -941,14 +941,14 @@ def shuffle_tables( return tables -def orig_cmp_site(i, j, tables): +def cmp_site(i, j, tables): ret = tables.sites.position[i] - tables.sites.position[j] if ret == 0: ret = i - j return ret -def orig_cmp_mutation(i, j, tables, site_order, canonical=False, num_descendants=None): +def cmp_mutation_canonical(i, j, tables, site_order, num_descendants=None): site_i = tables.mutations.site[i] site_j = tables.mutations.site[j] ret = site_order[site_i] - site_order[site_j] @@ -958,17 +958,31 @@ def orig_cmp_mutation(i, j, tables, site_order, canonical=False, num_descendants and (not tskit.is_unknown_time(tables.mutations.time[j])) ): ret = tables.mutations.time[j] - tables.mutations.time[i] - if canonical: - if ret == 0: - ret = num_descendants[j] - num_descendants[i] - if ret == 0: - ret = tables.mutations.node[i] - tables.mutations.node[j] + if ret == 0: + ret = num_descendants[j] - num_descendants[i] + if ret == 0: + ret = tables.mutations.node[i] - tables.mutations.node[j] if ret == 0: ret = i - j return ret -def orig_cmp_edge(i, j, tables): +def cmp_mutation(i, j, tables, site_order): + site_i = tables.mutations.site[i] + site_j = tables.mutations.site[j] + ret = site_order[site_i] - site_order[site_j] + if ( + ret == 0 + and (not tskit.is_unknown_time(tables.mutations.time[i])) + and (not tskit.is_unknown_time(tables.mutations.time[j])) + ): + ret = tables.mutations.time[j] - tables.mutations.time[i] + if ret == 0: + ret = i - j + return ret + + +def cmp_edge(i, j, tables): ret = ( tables.nodes.time[tables.edges.parent[i]] - tables.nodes.time[tables.edges.parent[j]] @@ -992,22 +1006,16 @@ def compute_mutation_num_descendants(tables): return num_descendants -def py_sort(tables, canonical=False): - cmp_edge = orig_cmp_edge - cmp_site = orig_cmp_site - cmp_mutation = orig_cmp_mutation - num_descendants = np.zeros(tables.mutations.num_rows) - if canonical: - # Reorder nodes, populations, and individuals. - # Note that this will keep unreferenced populations and individuals, - # but will not put these in canonical order. - tables.subset( - np.arange(tables.nodes.num_rows), - record_provenance=False, - remove_unreferenced=False, - ) - num_descendants = compute_mutation_num_descendants(tables) +def py_canonicalise(tables, remove_unreferenced=True): + tables.subset( + np.arange(tables.nodes.num_rows), + record_provenance=False, + remove_unreferenced=remove_unreferenced, + ) + py_sort(tables, use_num_descendants=True) + +def py_sort(tables, use_num_descendants=False): copy = tables.copy() tables.edges.clear() tables.sites.clear() @@ -1018,16 +1026,26 @@ def py_sort(tables, canonical=False): sorted_sites = sorted(range(copy.sites.num_rows), key=site_key) site_id_map = {k: j for j, k in enumerate(sorted_sites)} site_order = np.argsort(sorted_sites) - mut_key = functools.cmp_to_key( - lambda a, b: cmp_mutation( - a, - b, - tables=copy, - site_order=site_order, - canonical=canonical, - num_descendants=num_descendants, + if use_num_descendants: + num_descendants = compute_mutation_num_descendants(copy) + mut_key = functools.cmp_to_key( + lambda a, b: cmp_mutation_canonical( + a, + b, + tables=copy, + site_order=site_order, + num_descendants=num_descendants, + ) + ) + else: + mut_key = functools.cmp_to_key( + lambda a, b: cmp_mutation( + a, + b, + tables=copy, + site_order=site_order, + ) ) - ) sorted_muts = sorted(range(copy.mutations.num_rows), key=mut_key) mut_id_map = {k: j for j, k in enumerate(sorted_muts)} mut_id_map[tskit.NULL] = tskit.NULL diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 5cf8e03504..f62881b42d 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2692,22 +2692,32 @@ def sort(self, edge_start=0): self._ll_tables.sort(edge_start) # TODO add provenance - def canonicalise(self): + def canonicalise(self, remove_unreferenced=True): """ This puts the tables in *canonical* form - to do this, the individual and population tables are sorted by the first node that refers to each - (see :meth:`TreeSequence.subset`, and note that individuals and - populations not referred to by any nodes will be put at the end of the - tables in their original order). Then, the remaining tables are sorted + (see :meth:`TreeSequence.subset`) Then, the remaining tables are sorted as in :meth:`.sort`, with the modification that mutations are sorted by site, then time, then number of descendant mutations (ensuring that parent mutations occur before children), then node, then original order in the tables. This ensures that any two tables with the same - information should be identical after canonical sorting, unless they - have multiple individuals or populations without nodes that refer to - them. + information should be identical after canonical sorting. + + By default, the method removes sites, individuals, and populations that + are not referenced (by mutations and nodes, respectively). If it is + desired to keep these, pass ``remove_unreferenced=False``, but note that + unreferenced individuals and populations are put at the end of the tables, + but not in canonical order. + + .. seealso:: + + :meth:`.sort` for sorting edges, mutations, and sites, and + :meth:`.subset` for reordering nodes, individuals, and populations. + + :param bool remove_unreferenced: Whether to remove unreferenced sites, + individuals, and populations. """ - self._ll_tables.canonicalise() + self._ll_tables.canonicalise(remove_unreferenced=remove_unreferenced) # TODO add provenance def compute_mutation_parents(self): From 357c45a3ca03a2dbf0cb15da1e116b2a7c990485 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 2 Feb 2021 09:38:36 +0000 Subject: [PATCH 5/6] Make individuals optional in C test parser. Fix some other merge issues. --- c/tests/test_tables.c | 3 ++- c/tests/testlib.c | 48 +++++++++++++++++++++++------------------- c/tskit/tables.c | 4 ++-- python/tests/tsutil.py | 6 ++++-- 4 files changed, 34 insertions(+), 27 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 4ec12f7f22..53282d73c8 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -5145,7 +5145,8 @@ test_table_collection_subset_with_options(tsk_flags_t options) &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); // unused individual - ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0); + ret = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, NULL, 0, NULL, 0); CU_ASSERT_FATAL(ret >= 0); ret = tsk_population_table_add_row(&tables.populations, NULL, 0); CU_ASSERT_FATAL(ret >= 0); diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 199c97ce26..8427e18bbd 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -638,30 +638,34 @@ parse_individuals(const char *text, tsk_individual_table_t *individual_table) q = strtok_r(NULL, ",", &q_cont); } CU_ASSERT_FATAL(q == NULL); + + /* parents and name are optional */ p = strtok_r(NULL, whitespace, &p_cont); - // the parents are comma-separated - parents_len = 1; - q = p; - while (*q != '\0') { - if (*q == ',') { - parents_len++; + parents_len = 0; + name = ""; + if (p != NULL) { + // the parents are comma-separated + parents_len = 1; + q = p; + while (*q != '\0') { + if (*q == ',') { + parents_len++; + } + q++; + } + CU_ASSERT_FATAL(parents_len >= 1); + strncpy(sub_line, p, MAX_LINE); + q = strtok_r(sub_line, ",", &q_cont); + for (k = 0; k < parents_len; k++) { + CU_ASSERT_FATAL(q != NULL); + parents[k] = atoi(q); + q = strtok_r(NULL, ",", &q_cont); + } + CU_ASSERT_FATAL(q == NULL); + p = strtok_r(NULL, whitespace, &p_cont); + if (p != NULL) { + name = p; } - q++; - } - CU_ASSERT_FATAL(parents_len >= 1); - strncpy(sub_line, p, MAX_LINE); - q = strtok_r(sub_line, ",", &q_cont); - for (k = 0; k < parents_len; k++) { - CU_ASSERT_FATAL(q != NULL); - parents[k] = atoi(q); - q = strtok_r(NULL, ",", &q_cont); - } - CU_ASSERT_FATAL(q == NULL); - p = strtok_r(NULL, whitespace, &p_cont); - if (p == NULL) { - name = ""; - } else { - name = p; } ret = tsk_individual_table_add_row(individual_table, flags, location, location_len, parents, parents_len, name, strlen(name)); diff --git a/c/tskit/tables.c b/c/tskit/tables.c index 9f649004c4..d06483db7e 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -9511,8 +9511,8 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, if (individual_map[k] == TSK_NULL) { tsk_individual_table_get_row_unsafe(&tables.individuals, k, &ind); ret = tsk_individual_table_add_row(&self->individuals, ind.flags, - ind.location, ind.location_length, ind.metadata, - ind.metadata_length); + ind.location, ind.location_length, ind.parents, ind.parents_length, + ind.metadata, ind.metadata_length); if (ret < 0) { goto out; } diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 3fd29b55c9..853e0944d7 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -658,7 +658,10 @@ def py_subset( for j, ind in enumerate(full.individuals): if j not in ind_map: ind_map[j] = tables.individuals.add_row( - ind.flags, ind.location, ind.metadata + ind.flags, + location=ind.location, + parents=ind.parents, + metadata=ind.metadata, ) for j, ind in enumerate(full.populations): if j not in pop_map: @@ -765,7 +768,6 @@ def py_union(tables, other, nodes, record_provenance=True, add_populations=True) parent=tskit.NULL, time=mut.time, metadata=mut.metadata, - time=mut.time, ) mut_map[other_id] = mut_id # migration table From 7176062c046e3124ecbf4daf908dd12eb3c84a37 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Tue, 2 Feb 2021 10:32:55 +0000 Subject: [PATCH 6/6] Minor updates to subset - Move C code to more logical locations. - Update new method signatures to avoid hard-coding defaults. - Various minor rephrasings. - Fill in gap in C test coverage. --- c/tests/test_tables.c | 10 +- c/tests/testlib.c | 1 + c/tskit/tables.c | 320 ++++++++++++++++++------------------ c/tskit/tables.h | 10 +- python/tests/test_tables.py | 2 +- python/tests/tsutil.py | 2 +- python/tskit/tables.py | 28 +++- 7 files changed, 191 insertions(+), 182 deletions(-) diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 53282d73c8..47c146ab12 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -4077,7 +4077,8 @@ test_sort_tables_canonical(void) "2 2 1 -1 2\n" "1 1 5 7 0.5\n" "1 2 1 -1 2\n" - "1 1 4 2 0.5\n"; + "1 1 4 2 0.5\n" + "1 1 6 7 0.5\n"; const char *nodes_sorted = "1 0 -1 0\n" "1 0 0 1\n" "1 0 1 -1\n" @@ -4098,7 +4099,8 @@ test_sort_tables_canonical(void) "2 4 3 -1 3\n" "2 2 1 -1 2\n" "2 1 4 4 0.5\n" - "2 1 5 6 0.5\n"; + "2 1 5 6 0.5\n" + "2 1 6 6 0.5\n"; const char *individuals_sorted_kept = "0 1.0\n" "0 3.0\n" "0 2.0\n" @@ -4123,7 +4125,7 @@ test_sort_tables_canonical(void) parse_sites(sites, &t1.sites); CU_ASSERT_EQUAL_FATAL(t1.sites.num_rows, 3); parse_mutations(mutations, &t1.mutations); - CU_ASSERT_EQUAL_FATAL(t1.mutations.num_rows, 8); + CU_ASSERT_EQUAL_FATAL(t1.mutations.num_rows, 9); ret = tsk_table_collection_canonicalise(&t1, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); @@ -4139,7 +4141,7 @@ test_sort_tables_canonical(void) parse_sites(sites_sorted, &t2.sites); parse_mutations(mutations_sorted, &t2.mutations); CU_ASSERT_EQUAL_FATAL(t2.sites.num_rows, 3); - CU_ASSERT_EQUAL_FATAL(t2.mutations.num_rows, 8); + CU_ASSERT_EQUAL_FATAL(t2.mutations.num_rows, 9); CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); diff --git a/c/tests/testlib.c b/c/tests/testlib.c index 8427e18bbd..9ebe005838 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -543,6 +543,7 @@ parse_mutations(const char *text, tsk_mutation_table_t *mutation_table) double time; char derived_state[MAX_LINE]; + /* site, node, derived_state, [parent, time] */ c = 0; while (text[c] != '\0') { /* Fill in the line */ diff --git a/c/tskit/tables.c b/c/tskit/tables.c index d06483db7e..52d99cfd02 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -4540,6 +4540,11 @@ typedef struct { tsk_size_t metadata_length; } edge_sort_t; +typedef struct { + tsk_mutation_t mut; + int num_descendants; +} mutation_canonical_sort_t; + static int cmp_site(const void *a, const void *b) { @@ -4577,6 +4582,30 @@ cmp_mutation(const void *a, const void *b) return ret; } +static int +cmp_mutation_canonical(const void *a, const void *b) +{ + const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; + const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; + /* Compare mutations by site */ + int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); + if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) + && !tsk_is_unknown_time(ib->mut.time)) { + ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); + } + if (ret == 0) { + ret = (ia->num_descendants < ib->num_descendants) + - (ia->num_descendants > ib->num_descendants); + } + if (ret == 0) { + ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); + } + if (ret == 0) { + ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); + } + return ret; +} + static int cmp_edge(const void *a, const void *b) { @@ -4759,6 +4788,81 @@ tsk_table_sorter_sort_mutations(tsk_table_sorter_t *self) return ret; } +static int +tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) +{ + int ret = 0; + tsk_size_t j; + tsk_id_t parent, mapped_parent, p; + tsk_mutation_table_t *mutations = &self->tables->mutations; + tsk_size_t num_mutations = mutations->num_rows; + tsk_mutation_table_t copy; + mutation_canonical_sort_t *sorted_mutations + = malloc(num_mutations * sizeof(*sorted_mutations)); + tsk_id_t *mutation_id_map = malloc(num_mutations * sizeof(*mutation_id_map)); + + ret = tsk_mutation_table_copy(mutations, ©, 0); + if (ret != 0) { + goto out; + } + if (mutation_id_map == NULL || sorted_mutations == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + + /* compute numbers of descendants for each mutation */ + for (j = 0; j < num_mutations; j++) { + sorted_mutations[j].num_descendants = 0; + } + for (j = 0; j < num_mutations; j++) { + p = mutations->parent[j]; + while (p != TSK_NULL) { + sorted_mutations[p].num_descendants += 1; + p = mutations->parent[p]; + } + } + + for (j = 0; j < num_mutations; j++) { + tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); + sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; + } + ret = tsk_mutation_table_clear(mutations); + if (ret != 0) { + goto out; + } + + qsort(sorted_mutations, num_mutations, sizeof(*sorted_mutations), + cmp_mutation_canonical); + + /* Make a first pass through the sorted mutations to build the ID map. */ + for (j = 0; j < num_mutations; j++) { + mutation_id_map[sorted_mutations[j].mut.id] = (tsk_id_t) j; + } + + for (j = 0; j < num_mutations; j++) { + mapped_parent = TSK_NULL; + parent = sorted_mutations[j].mut.parent; + if (parent != TSK_NULL) { + mapped_parent = mutation_id_map[parent]; + } + ret = tsk_mutation_table_add_row(mutations, sorted_mutations[j].mut.site, + sorted_mutations[j].mut.node, mapped_parent, sorted_mutations[j].mut.time, + sorted_mutations[j].mut.derived_state, + sorted_mutations[j].mut.derived_state_length, + sorted_mutations[j].mut.metadata, sorted_mutations[j].mut.metadata_length); + if (ret < 0) { + goto out; + } + } + ret = 0; + +out: + tsk_safe_free(mutation_id_map); + tsk_safe_free(sorted_mutations); + tsk_mutation_table_free(©); + return ret; +} + int tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) { @@ -4852,152 +4956,6 @@ tsk_table_sorter_free(tsk_table_sorter_t *self) return 0; } -/****************** - * Canonical sort - ******************/ - -typedef struct { - tsk_mutation_t mut; - int num_descendants; -} mutation_canonical_sort_t; - -static int -cmp_mutation_canonical(const void *a, const void *b) -{ - const mutation_canonical_sort_t *ia = (const mutation_canonical_sort_t *) a; - const mutation_canonical_sort_t *ib = (const mutation_canonical_sort_t *) b; - /* Compare mutations by site */ - int ret = (ia->mut.site > ib->mut.site) - (ia->mut.site < ib->mut.site); - if (ret == 0 && !tsk_is_unknown_time(ia->mut.time) - && !tsk_is_unknown_time(ib->mut.time)) { - ret = (ia->mut.time < ib->mut.time) - (ia->mut.time > ib->mut.time); - } - if (ret == 0) { - ret = (ia->num_descendants < ib->num_descendants) - - (ia->num_descendants > ib->num_descendants); - } - if (ret == 0) { - ret = (ia->mut.node > ib->mut.node) - (ia->mut.node < ib->mut.node); - } - if (ret == 0) { - ret = (ia->mut.id > ib->mut.id) - (ia->mut.id < ib->mut.id); - } - return ret; -} - -static int -tsk_table_sorter_sort_mutations_canonical(tsk_table_sorter_t *self) -{ - int ret = 0; - tsk_size_t j; - tsk_id_t parent, mapped_parent, p; - tsk_mutation_table_t *mutations = &self->tables->mutations; - tsk_size_t num_mutations = mutations->num_rows; - tsk_mutation_table_t copy; - mutation_canonical_sort_t *sorted_mutations - = malloc(num_mutations * sizeof(*sorted_mutations)); - tsk_id_t *mutation_id_map = malloc(num_mutations * sizeof(*mutation_id_map)); - - ret = tsk_mutation_table_copy(mutations, ©, 0); - if (ret != 0) { - goto out; - } - if (mutation_id_map == NULL || sorted_mutations == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - - /* compute numbers of descendants for each mutation */ - for (j = 0; j < num_mutations; j++) { - sorted_mutations[j].num_descendants = 0; - } - for (j = 0; j < num_mutations; j++) { - p = mutations->parent[j]; - while (p != TSK_NULL) { - sorted_mutations[p].num_descendants += 1; - p = mutations->parent[p]; - } - } - - for (j = 0; j < num_mutations; j++) { - tsk_mutation_table_get_row_unsafe(©, (tsk_id_t) j, &sorted_mutations[j].mut); - sorted_mutations[j].mut.site = self->site_id_map[sorted_mutations[j].mut.site]; - } - ret = tsk_mutation_table_clear(mutations); - if (ret != 0) { - goto out; - } - - qsort(sorted_mutations, num_mutations, sizeof(*sorted_mutations), - cmp_mutation_canonical); - - /* Make a first pass through the sorted mutations to build the ID map. */ - for (j = 0; j < num_mutations; j++) { - mutation_id_map[sorted_mutations[j].mut.id] = (tsk_id_t) j; - } - - for (j = 0; j < num_mutations; j++) { - mapped_parent = TSK_NULL; - parent = sorted_mutations[j].mut.parent; - if (parent != TSK_NULL) { - mapped_parent = mutation_id_map[parent]; - } - ret = tsk_mutation_table_add_row(mutations, sorted_mutations[j].mut.site, - sorted_mutations[j].mut.node, mapped_parent, sorted_mutations[j].mut.time, - sorted_mutations[j].mut.derived_state, - sorted_mutations[j].mut.derived_state_length, - sorted_mutations[j].mut.metadata, sorted_mutations[j].mut.metadata_length); - if (ret < 0) { - goto out; - } - } - ret = 0; - -out: - tsk_safe_free(mutation_id_map); - tsk_safe_free(sorted_mutations); - tsk_mutation_table_free(©); - return ret; -} - -int TSK_WARN_UNUSED -tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options) -{ - int ret = 0; - tsk_id_t k; - tsk_id_t *nodes = NULL; - tsk_table_sorter_t sorter; - tsk_flags_t subset_options = options & TSK_KEEP_UNREFERENCED; - - ret = tsk_table_sorter_init(&sorter, self, 0); - if (ret != 0) { - goto out; - } - - nodes = malloc(self->nodes.num_rows * sizeof(*nodes)); - if (nodes == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { - nodes[k] = k; - } - ret = tsk_table_collection_subset(self, nodes, self->nodes.num_rows, subset_options); - if (ret != 0) { - goto out; - } - sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; - - ret = tsk_table_sorter_run(&sorter, NULL); - if (ret != 0) { - goto out; - } -out: - tsk_safe_free(nodes); - tsk_table_sorter_free(&sorter); - return ret; -} - /************************* * segment overlapper *************************/ @@ -8937,6 +8895,43 @@ tsk_table_collection_sort( return ret; } +int TSK_WARN_UNUSED +tsk_table_collection_canonicalise(tsk_table_collection_t *self, tsk_flags_t options) +{ + int ret = 0; + tsk_id_t k; + tsk_id_t *nodes = NULL; + tsk_table_sorter_t sorter; + tsk_flags_t subset_options = options & TSK_KEEP_UNREFERENCED; + + ret = tsk_table_sorter_init(&sorter, self, 0); + if (ret != 0) { + goto out; + } + sorter.sort_mutations = tsk_table_sorter_sort_mutations_canonical; + + nodes = malloc(self->nodes.num_rows * sizeof(*nodes)); + if (nodes == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + for (k = 0; k < (tsk_id_t) self->nodes.num_rows; k++) { + nodes[k] = k; + } + ret = tsk_table_collection_subset(self, nodes, self->nodes.num_rows, subset_options); + if (ret != 0) { + goto out; + } + ret = tsk_table_sorter_run(&sorter, NULL); + if (ret != 0) { + goto out; + } +out: + tsk_safe_free(nodes); + tsk_table_sorter_free(&sorter); + return ret; +} + /* * Remove any sites with duplicate positions, retaining only the *first* * one. Assumes the tables have been sorted, throwing an error if not. @@ -9497,7 +9492,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, } } - // nodes, individuals, populations + // Nodes, individuals, populations for (k = 0; k < (tsk_id_t) num_nodes; k++) { ret = tsk_table_collection_add_and_remap_node( self, &tables, nodes[k], individual_map, population_map, node_map, true); @@ -9505,8 +9500,17 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, goto out; } } + + /* TODO: Subset the migrations table. We would need to make sure + * that we don't remove populations that are referenced, so it would + * need to be done before the next code block. */ + if (tables.migrations.num_rows != 0) { + ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; + goto out; + } + if (keep_unreferenced) { - // keep unused individuals and populations + // Keep unused individuals and populations for (k = 0; k < (tsk_id_t) tables.individuals.num_rows; k++) { if (individual_map[k] == TSK_NULL) { tsk_individual_table_get_row_unsafe(&tables.individuals, k, &ind); @@ -9530,7 +9534,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, } } - // edges + // Edges for (k = 0; k < (tsk_id_t) tables.edges.num_rows; k++) { tsk_edge_table_get_row_unsafe(&tables.edges, k, &edge); new_parent = node_map[edge.parent]; @@ -9544,9 +9548,9 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, } } - // mutations and sites - // make a first pass through to build the mutation_map so that - // mutation parent can be remapped even if the table is not in order + // Mutations and sites + // Make a first pass through to build the mutation_map so that + // mutation parent can be remapped even if the table is not in order. j = 0; for (k = 0; k < (tsk_id_t) tables.mutations.num_rows; k++) { if (node_map[tables.mutations.node[k]] != TSK_NULL) { @@ -9554,12 +9558,12 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, j++; site_id = tables.mutations.site[k]; if (site_map[site_id] == TSK_NULL) { - // insert a sentinel non-NULL value: + // Insert a temporary non-NULL value site_map[site_id] = 1; } } } - // keep retained sites in their original order + // Keep retained sites in their original order j = 0; for (k = 0; k < (tsk_id_t) tables.sites.num_rows; k++) { if (keep_unreferenced || site_map[k] != TSK_NULL) { @@ -9595,15 +9599,7 @@ tsk_table_collection_subset(tsk_table_collection_t *self, const tsk_id_t *nodes, } } - /* TODO: Subset of the Migrations Table. The way to do this properly is not - * well-defined, mostly because migrations might contain events from/to - * populations that have not been kept in after the subset. */ - if (tables.migrations.num_rows != 0) { - ret = TSK_ERR_MIGRATIONS_NOT_SUPPORTED; - goto out; - } ret = 0; - out: tsk_safe_free(node_map); tsk_safe_free(individual_map); diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 9a48b7b701..26580863b3 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -2660,10 +2660,10 @@ int tsk_table_collection_sort( @brief Puts the tables into canonical form. @rst -This method puts tables in canonical form, which exceeds the usual -sortedness requirements in such a way that randomly reshuffled tables are -guaranteed to always be sorted to the same order. In particular, this method -puts mutation parents before their children. +Put tables into canonical form such that randomly reshuffled tables +are guaranteed to always be sorted in the same order, and redundant +information is removed. The canonical sorting exceeds the usual +tree sequence sortedness requirements. **Options**: @@ -2765,7 +2765,7 @@ each of the tables as follows (but see options, below): 1. Nodes: if in the list of nodes, and in the order provided. 2. Individuals and Populations: if referred to by a retained node, and in the - order first seen when traversing the list of retained nodes. + order first seen when traversing the list of retained nodes. 3. Edges: if both parent and child are retained nodes. 4. Mutations: if the mutation's node is a retained node. 5. Sites: if any mutations remain at the site after removing mutations. diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 149c08012a..7f6fab8b0f 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -3582,7 +3582,7 @@ def test_examples(self): self.verify_union_equality(*self.split_example(ts, T)) -class TestSubsetUnion(unittest.TestCase): +class TestSubsetUnion: # Check that we can remove a single individual from a tree sequence and # then use union to put it back in again. This is a good test of union, # subset, and various sort options. diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 853e0944d7..7f7186e741 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -871,7 +871,7 @@ def shuffle_tables( ind_id_map = {tskit.NULL: tskit.NULL} for j, i in randomised_inds: ind_id_map[j] = tables.individuals.add_row( - flags=i.flags, location=i.location, metadata=i.metadata + flags=i.flags, location=i.location, parents=i.parents, metadata=i.metadata ) # nodes (same order, but remapped populations and individuals) for n in orig.nodes: diff --git a/python/tskit/tables.py b/python/tskit/tables.py index f62881b42d..defa4483ae 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2692,7 +2692,7 @@ def sort(self, edge_start=0): self._ll_tables.sort(edge_start) # TODO add provenance - def canonicalise(self, remove_unreferenced=True): + def canonicalise(self, remove_unreferenced=None): """ This puts the tables in *canonical* form - to do this, the individual and population tables are sorted by the first node that refers to each @@ -2704,10 +2704,10 @@ def canonicalise(self, remove_unreferenced=True): information should be identical after canonical sorting. By default, the method removes sites, individuals, and populations that - are not referenced (by mutations and nodes, respectively). If it is - desired to keep these, pass ``remove_unreferenced=False``, but note that - unreferenced individuals and populations are put at the end of the tables, - but not in canonical order. + are not referenced (by mutations and nodes, respectively). If you wish + to keep these, pass ``remove_unreferenced=False``, but note that + unreferenced individuals and populations are put at the end of the tables + in their original order. .. seealso:: @@ -2715,8 +2715,11 @@ def canonicalise(self, remove_unreferenced=True): :meth:`.subset` for reordering nodes, individuals, and populations. :param bool remove_unreferenced: Whether to remove unreferenced sites, - individuals, and populations. + individuals, and populations (default=True). """ + remove_unreferenced = ( + True if remove_unreferenced is None else remove_unreferenced + ) self._ll_tables.canonicalise(remove_unreferenced=remove_unreferenced) # TODO add provenance @@ -2768,7 +2771,7 @@ def deduplicate_sites(self): site), and renumbering the ``site`` column of the mutation table appropriately. This requires the site table to be sorted by position. - ..warning:: This method does not sort the tables afterwards, so + .. warning:: This method does not sort the tables afterwards, so mutations may no longer be sorted by time. """ self._ll_tables.deduplicate_sites() @@ -3050,8 +3053,9 @@ def subset( self, nodes, record_provenance=True, - reorder_populations=True, - remove_unreferenced=True, + *, + reorder_populations=None, + remove_unreferenced=None, ): """ Modifies the tables in place to contain only the entries referring to @@ -3072,6 +3076,12 @@ def subset( that are not referred to by any retained entries in the tables should be removed (default: True). See the description for details. """ + reorder_populations = ( + True if reorder_populations is None else reorder_populations + ) + remove_unreferenced = ( + True if remove_unreferenced is None else remove_unreferenced + ) nodes = util.safe_np_int_cast(nodes, np.int32) self._ll_tables.subset( nodes,