Skip to content

Union of tree sequences #623

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 2 commits into from
Jul 17, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions c/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ In development.

**New features**

- New methods to perform set operations on table collections.
``tsk_table_collection_subset`` subsets and reorders table collections by nodes
(:user:`mufernando`, :user:`petrelharp`, :pr:`663`, :pr:`690`).
``tsk_table_collection_union`` forms the node-wise union of two table collections
(:user:`mufernando`, :user:`petrelharp`, :issue:`381`, :pr:`623`).

- Mutations now have an optional double-precision floating-point ``time`` column.
If not specified, this defaults to a particular NaN value (``TSK_UNKNOWN_TIME``)
indicating that the time is unknown. For a tree sequence to be considered valid
Expand Down
259 changes: 250 additions & 9 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -3986,10 +3986,10 @@ test_table_collection_subset_with_options(tsk_flags_t options)
// four nodes from two diploids; the first is from pop 0
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
&tables.nodes, TSK_NODE_IS_SAMPLE, 2.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
Expand All @@ -4009,13 +4009,16 @@ test_table_collection_subset_with_options(tsk_flags_t options)
ret = tsk_site_table_add_row(&tables.sites, 0.4, "A", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, TSK_NULL, NAN, NULL, 0, NULL, 0);
&tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(&tables.mutations, 0, 0, 0, NAN, NULL, 0, NULL, 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, 0, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 1, 1, TSK_NULL, NAN, NULL, 0, NULL, 0);
&tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_build_index(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// empty nodes should get empty tables
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT | options);
Expand Down Expand Up @@ -4069,16 +4072,17 @@ test_table_collection_subset_errors(void)

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1;
ret = tsk_table_collection_init(&tables_copy, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// four nodes from two diploids; the first is from pop 0
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 1.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
&tables.nodes, TSK_NODE_IS_SAMPLE, 2.0, TSK_NULL, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(
&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, TSK_NULL, 1, NULL, 0);
Expand All @@ -4091,6 +4095,8 @@ test_table_collection_subset_errors(void)
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_build_index(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

/* Migrations are not supported */
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
Expand All @@ -4101,15 +4107,248 @@ test_table_collection_subset_errors(void)
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED);

// test out of bounds nodes
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
nodes[0] = -1;
ret = tsk_table_collection_subset(&tables, nodes, 4);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);
nodes[0] = 6;
ret = tsk_table_collection_subset(&tables, nodes, 4);
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_NODE_OUT_OF_BOUNDS);

// check integrity
nodes[0] = 0;
nodes[1] = 1;
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_node_table_truncate(&tables_copy.nodes, 3);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_node_table_add_row(
&tables_copy.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_subset(&tables_copy, nodes, 4);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);

tsk_table_collection_free(&tables);
tsk_table_collection_free(&tables_copy);
}

static void
test_table_collection_union(void)
{
int ret;
tsk_table_collection_t tables;
tsk_table_collection_t tables_empty;
tsk_table_collection_t tables_copy;
tsk_id_t node_mapping[3];

memset(node_mapping, 0xff, sizeof(node_mapping));

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1;
ret = tsk_table_collection_init(&tables_empty, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables_empty.sequence_length = 1;
ret = tsk_table_collection_init(&tables_copy, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// does not error on empty tables
ret = tsk_table_collection_union(
&tables, &tables_empty, node_mapping, TSK_UNION_NO_CHECK_SHARED);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// three nodes, two pop, three ind, two edge, two site, two mut
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 1, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 2, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 2, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_site_table_add_row(&tables.sites, 0.4, "T", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 1, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_build_index(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_sort(&tables, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// union with empty should not change
// other is empty
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_union(
&tables_copy, &tables_empty, node_mapping, TSK_UNION_NO_CHECK_SHARED);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy));
// self is empty
ret = tsk_table_collection_clear(&tables_copy);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_union(
&tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy));

// union all shared nodes + subset original nodes = original table
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_union(
&tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED);
CU_ASSERT_EQUAL_FATAL(ret, 0);
node_mapping[0] = 0;
node_mapping[1] = 1;
node_mapping[2] = 2;
ret = tsk_table_collection_subset(&tables_copy, node_mapping, 3);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, &tables_copy));

// union with one shared node
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
node_mapping[0] = TSK_NULL;
node_mapping[1] = TSK_NULL;
node_mapping[2] = 2;
ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL_FATAL(
tables_copy.populations.num_rows, tables.populations.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(
tables_copy.individuals.num_rows, tables.individuals.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, tables.nodes.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.edges.num_rows, tables.edges.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, tables.sites.num_rows);
CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, tables.mutations.num_rows + 2);

// union with one shared node, but no add pop
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
node_mapping[0] = TSK_NULL;
node_mapping[1] = TSK_NULL;
node_mapping[2] = 2;
ret = tsk_table_collection_union(
&tables_copy, &tables, node_mapping, TSK_UNION_NO_ADD_POP);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.populations.num_rows, tables.populations.num_rows);
CU_ASSERT_EQUAL_FATAL(
tables_copy.individuals.num_rows, tables.individuals.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.nodes.num_rows, tables.nodes.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.edges.num_rows, tables.edges.num_rows + 2);
CU_ASSERT_EQUAL_FATAL(tables_copy.sites.num_rows, tables.sites.num_rows);
CU_ASSERT_EQUAL_FATAL(tables_copy.mutations.num_rows, tables.mutations.num_rows + 2);

tsk_table_collection_free(&tables_copy);
tsk_table_collection_free(&tables_empty);
tsk_table_collection_free(&tables);
}

static void
test_table_collection_union_errors(void)
{
int ret;
tsk_table_collection_t tables;
tsk_table_collection_t tables_copy;
tsk_id_t node_mapping[] = { 0, 1 };

ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1;
ret = tsk_table_collection_init(&tables_copy, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);

// two nodes, two pop, two ind, one edge, one site, one mut
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.5, 1, 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_individual_table_add_row(&tables.individuals, 0, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_edge_table_add_row(&tables.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_site_table_add_row(&tables.sites, 0.2, "A", 1, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_mutation_table_add_row(
&tables.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);

// trigger diff histories error
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_mutation_table_add_row(
&tables_copy.mutations, 0, 1, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_DIFF_HISTORIES);

// Migrations are not supported
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_migration_table_add_row(&tables_copy.migrations, 0, 1, 0, 0, 0, 0, NULL, 0);
CU_ASSERT_EQUAL_FATAL(tables_copy.migrations.num_rows, 1);
ret = tsk_table_collection_union(
&tables_copy, &tables, node_mapping, TSK_UNION_NO_CHECK_SHARED);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_MIGRATIONS_NOT_SUPPORTED);

// unsuported union - child shared parent not shared
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
node_mapping[0] = 0;
node_mapping[1] = TSK_NULL;
ret = tsk_table_collection_union(
&tables_copy, &tables, node_mapping, TSK_UNION_NO_ADD_POP);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_NOT_SUPPORTED);

// test out of bounds node_mapping
node_mapping[0] = -4;
node_mapping[1] = 6;
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNION_BAD_MAP);

// check integrity
node_mapping[0] = 0;
node_mapping[1] = 1;
ret = tsk_node_table_add_row(
&tables_copy.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_union(&tables_copy, &tables, node_mapping, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);
ret = tsk_table_collection_copy(&tables, &tables_copy, TSK_NO_INIT);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_node_table_add_row(&tables.nodes, TSK_NODE_IS_SAMPLE, 0.0, -2, 0, NULL, 0);
CU_ASSERT_FATAL(ret >= 0);
ret = tsk_table_collection_union(&tables, &tables_copy, node_mapping, 0);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_POPULATION_OUT_OF_BOUNDS);

tsk_table_collection_free(&tables_copy);
tsk_table_collection_free(&tables);
}

int
Expand Down Expand Up @@ -4168,6 +4407,8 @@ main(int argc, char **argv)
test_table_collection_check_integrity_no_populations },
{ "test_table_collection_subset", test_table_collection_subset },
{ "test_table_collection_subset_errors", test_table_collection_subset_errors },
{ "test_table_collection_union", test_table_collection_union },
{ "test_table_collection_union_errors", test_table_collection_union_errors },
{ NULL, NULL },
};

Expand Down
14 changes: 14 additions & 0 deletions c/tskit/core.c
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,10 @@ tsk_strerror_internal(int err)
case TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED:
ret = "Only binary mutations are supported for this operation";
break;
case TSK_ERR_UNION_NOT_SUPPORTED:
ret = "Union is not supported for cases where there is non-shared"
"history older than the shared history of the two Table Collections";
break;

/* Stats errors */
case TSK_ERR_BAD_NUM_WINDOWS:
Expand Down Expand Up @@ -441,6 +445,16 @@ tsk_strerror_internal(int err)
case TSK_ERR_TOO_MANY_VALUES:
ret = "Too many values to compress";
break;

/* Union errors */
case TSK_ERR_UNION_BAD_MAP:
ret = "Node map contains an entry of a node not present in this table "
"collection.";
break;
case TSK_ERR_UNION_DIFF_HISTORIES:
// histories could be equivalent, because subset does not reorder
// edges (if not sorted) or mutations.
ret = "Shared portions of the tree sequences are not equal.";
}
return ret;
}
Expand Down
6 changes: 6 additions & 0 deletions c/tskit/core.h
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ not found in the file.
#define TSK_ERR_SORT_OFFSET_NOT_SUPPORTED -803
#define TSK_ERR_NONBINARY_MUTATIONS_UNSUPPORTED -804
#define TSK_ERR_MIGRATIONS_NOT_SUPPORTED -805
#define TSK_ERR_UNION_NOT_SUPPORTED -806

/* Stats errors */
#define TSK_ERR_BAD_NUM_WINDOWS -900
Expand Down Expand Up @@ -303,6 +304,11 @@ not found in the file.
#define TSK_ERR_MATCH_IMPOSSIBLE -1301
#define TSK_ERR_BAD_COMPRESSED_MATRIX_NODE -1302
#define TSK_ERR_TOO_MANY_VALUES -1303

/* Union errors */
#define TSK_ERR_UNION_BAD_MAP -1400
#define TSK_ERR_UNION_DIFF_HISTORIES -1401

// clang-format on

/* This bit is 0 for any errors originating from kastore */
Expand Down
Loading