From cde024a5b25ec679069139a334dd2cab0983e839 Mon Sep 17 00:00:00 2001 From: Ben Jeffery Date: Fri, 5 Nov 2021 13:02:36 +0000 Subject: [PATCH] Add discrete time --- c/tests/test_trees.c | 76 ++++++++++++++++++++++++++++++++++ c/tests/testlib.c | 64 ++++++++++++++++++++++++++++ c/tskit/trees.c | 38 ++++++++++++++++- c/tskit/trees.h | 3 ++ python/CHANGELOG.rst | 4 ++ python/_tskitmodule.c | 17 ++++++++ python/tests/test_highlevel.py | 13 ++++++ python/tests/test_lowlevel.py | 14 +++++++ python/tskit/trees.py | 17 ++++++++ 9 files changed, 245 insertions(+), 1 deletion(-) diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 0ed977392c..1b5febb9d7 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -1022,6 +1022,81 @@ test_simplest_discrete_genome(void) tsk_table_collection_free(&tables); } +static void +test_simplest_discrete_time(void) +{ + int ret; + tsk_treeseq_t ts; + tsk_table_collection_t tables; + const char *nodes = "1 0 0\n" + "1 0 0\n" + "0 1 0\n" + "0 0 0\n" + "0 0 0"; + const char *edges = "0 1 2 0,1,3,4\n"; + const char *sites = "0.1 0\n" + "0.2 0\n" + "0.3 0\n" + "0.4 0\n"; + const char *mutations = "0 0 1\n" + "1 1 1\n" + "2 3 1\n" + "3 4 1"; + const char *migrations = "0 1 0 0 1 1"; + + tsk_treeseq_from_text( + &ts, 1, nodes, edges, migrations, sites, mutations, NULL, NULL, 0); + CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); + + ret = tsk_table_collection_copy(ts.tables, &tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); + + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + + tables.nodes.time[0] = 0.0001; + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + tables.nodes.time[0] = 0; + + tables.mutations.time[0] = 0.001; + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + tables.mutations.time[0] = 0; + + tables.migrations.time[0] = 0.001; + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FALSE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + tables.migrations.time[0] = 0; + + tables.mutations.time[0] = TSK_UNKNOWN_TIME; + tables.mutations.time[1] = TSK_UNKNOWN_TIME; + tables.mutations.time[2] = TSK_UNKNOWN_TIME; + tables.mutations.time[3] = TSK_UNKNOWN_TIME; + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + + /* An empty tree sequence is has a discrete time. */ + tsk_table_collection_clear(&tables, 0); + ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_treeseq_get_discrete_time(&ts)); + tsk_treeseq_free(&ts); + + tsk_table_collection_free(&tables); +} + static void test_simplest_records(void) { @@ -6746,6 +6821,7 @@ main(int argc, char **argv) CU_TestInfo tests[] = { /* simplest example tests */ { "test_simplest_discrete_genome", test_simplest_discrete_genome }, + { "test_simplest_discrete_time", test_simplest_discrete_time }, { "test_simplest_records", test_simplest_records }, { "test_simplest_nonbinary_records", test_simplest_nonbinary_records }, { "test_simplest_unary_records", test_simplest_unary_records }, diff --git a/c/tests/testlib.c b/c/tests/testlib.c index d966b3cfd6..7e4947501a 100644 --- a/c/tests/testlib.c +++ b/c/tests/testlib.c @@ -493,6 +493,63 @@ parse_edges(const char *text, tsk_edge_table_t *edge_table) } } +void +parse_migrations(const char *text, tsk_migration_table_t *migration_table) +{ + tsk_id_t ret_id; + size_t c, k; + size_t MAX_LINE = 1024; + char line[MAX_LINE]; + const char *whitespace = " \t"; + char *p; + double left, right, time; + int node, source, dest; + char *metadata; + + c = 0; + while (text[c] != '\0') { + /* Fill in the line */ + k = 0; + while (text[c] != '\n' && text[c] != '\0') { + CU_ASSERT_FATAL(k < MAX_LINE - 1); + line[k] = text[c]; + c++; + k++; + } + if (text[c] == '\n') { + c++; + } + line[k] = '\0'; + p = strtok(line, whitespace); + CU_ASSERT_FATAL(p != NULL); + left = atof(p); + p = strtok(NULL, whitespace); + CU_ASSERT_FATAL(p != NULL); + right = atof(p); + p = strtok(NULL, whitespace); + CU_ASSERT_FATAL(p != NULL); + node = atoi(p); + p = strtok(NULL, whitespace); + CU_ASSERT_FATAL(p != NULL); + source = atoi(p); + p = strtok(NULL, whitespace); + CU_ASSERT_FATAL(p != NULL); + dest = atoi(p); + p = strtok(NULL, whitespace); + CU_ASSERT_FATAL(p != NULL); + time = atof(p); + p = strtok(NULL, whitespace); + if (p == NULL) { + metadata = ""; + } else { + metadata = p; + } + ret_id = tsk_migration_table_add_row(migration_table, left, right, node, source, + dest, time, metadata, strlen(metadata)); + CU_ASSERT_FATAL(ret_id >= 0); + } +} + void parse_sites(const char *text, tsk_site_table_t *site_table) { @@ -706,11 +763,18 @@ tsk_treeseq_from_text(tsk_treeseq_t *ts, double sequence_length, const char *nod if (individuals != NULL) { parse_individuals(individuals, &tables.individuals); } + if (migrations != NULL) { + parse_migrations(migrations, &tables.migrations); + } /* We need to add in populations if they are referenced */ max_population_id = -1; for (j = 0; j < tables.nodes.num_rows; j++) { max_population_id = TSK_MAX(max_population_id, tables.nodes.population[j]); } + for (j = 0; j < tables.migrations.num_rows; j++) { + max_population_id = TSK_MAX(max_population_id, tables.migrations.source[j]); + max_population_id = TSK_MAX(max_population_id, tables.migrations.dest[j]); + } if (max_population_id >= 0) { for (j = 0; j <= (tsk_size_t) max_population_id; j++) { ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0); diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 7e79d44367..0e6f487698 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -315,13 +315,33 @@ tsk_treeseq_init_migrations(tsk_treeseq_t *self) tsk_size_t num_migrations = self->tables->migrations.num_rows; const double *restrict left = self->tables->migrations.left; const double *restrict right = self->tables->migrations.right; + const double *restrict time = self->tables->migrations.time; bool discrete_breakpoints = true; + bool discrete_times = true; for (j = 0; j < num_migrations; j++) { discrete_breakpoints = discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]); + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); } self->discrete_genome = self->discrete_genome && discrete_breakpoints; + self->discrete_time = self->discrete_time && discrete_times; +} + +static void +tsk_treeseq_init_mutations(tsk_treeseq_t *self) +{ + tsk_size_t j; + tsk_size_t num_mutations = self->tables->mutations.num_rows; + const double *restrict time = self->tables->mutations.time; + bool discrete_times = true; + + for (j = 0; j < num_mutations; j++) { + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); + } + self->discrete_time = self->discrete_time && discrete_times; } static int @@ -330,7 +350,9 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) tsk_size_t j, k; tsk_size_t num_nodes = self->tables->nodes.num_rows; const tsk_flags_t *restrict node_flags = self->tables->nodes.flags; + const double *restrict time = self->tables->nodes.time; int ret = 0; + bool discrete_times = true; /* Determine the sample size */ self->num_samples = 0; @@ -356,6 +378,12 @@ tsk_treeseq_init_nodes(tsk_treeseq_t *self) } } tsk_bug_assert(k == self->num_samples); + + for (j = 0; j < num_nodes; j++) { + discrete_times + = discrete_times && (is_discrete(time[j]) || tsk_is_unknown_time(time[j])); + } + self->discrete_time = self->discrete_time && discrete_times; out: return ret; } @@ -426,11 +454,12 @@ tsk_treeseq_init( tsk_memcpy(self->tables->file_uuid, tables->file_uuid, TSK_UUID_SIZE + 1); } + self->discrete_genome = true; + self->discrete_time = true; ret = tsk_treeseq_init_nodes(self); if (ret != 0) { goto out; } - self->discrete_genome = true; ret = tsk_treeseq_init_sites(self); if (ret != 0) { goto out; @@ -444,6 +473,7 @@ tsk_treeseq_init( goto out; } tsk_treeseq_init_migrations(self); + tsk_treeseq_init_mutations(self); if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED) && !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED, @@ -669,6 +699,12 @@ tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self) return self->discrete_genome; } +bool +tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self) +{ + return self->discrete_time; +} + /* Stats functions */ #define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row)) diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 0a01e516cd..ea5f2188f0 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -70,6 +70,8 @@ typedef struct { bool time_uncalibrated; /* Are all genome coordinates discrete? */ bool discrete_genome; + /* Are all time values discrete? */ + bool discrete_time; /* Breakpoints along the sequence, including 0 and L. */ double *breakpoints; /* If a node is a sample, map to its index in the samples list */ @@ -266,6 +268,7 @@ const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self); const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self); bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u); bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self); +bool tsk_treeseq_get_discrete_time(const tsk_treeseq_t *self); int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node); int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge); diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 38e6ff5aee..a424a55d37 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -86,6 +86,9 @@ - Add the ``TreeSequence.alignments`` method. (user:`jeromekelleher`, :pr:`1825`) +- Add the ``discrete_time`` property to the TreeSequence class which is true if + all time coordinates are discrete or unknown (:user:`benjeffery`, :issue:`1839`, :pr:`1890`) + **Fixes** - `dump_tables` omitted individual parents. (:user:`benjeffery`, :issue:`1828`, :pr:`1884`) @@ -101,6 +104,7 @@ (:user:`jeetsukumaran`, :user:`jeromekelleher`, :issue:`1785`, :pr:`1835`, :pr:`1836`, :pr:`1838`) + -------------------- [0.3.7] - 2021-07-08 -------------------- diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 0b27ee146a..0548653084 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -7830,6 +7830,19 @@ TreeSequence_get_discrete_genome(TreeSequence *self) return ret; } +static PyObject * +TreeSequence_get_discrete_time(TreeSequence *self) +{ + PyObject *ret = NULL; + + if (TreeSequence_check_state(self) != 0) { + goto out; + } + ret = Py_BuildValue("i", tsk_treeseq_get_discrete_time(self->tree_sequence)); +out: + return ret; +} + static PyObject * TreeSequence_get_breakpoints(TreeSequence *self) { @@ -9039,6 +9052,10 @@ static PyMethodDef TreeSequence_methods[] = { .ml_meth = (PyCFunction) TreeSequence_get_discrete_genome, .ml_flags = METH_NOARGS, .ml_doc = "Returns True if this TreeSequence has discrete coordinates" }, + { .ml_name = "get_discrete_time", + .ml_meth = (PyCFunction) TreeSequence_get_discrete_time, + .ml_flags = METH_NOARGS, + .ml_doc = "Returns True if this TreeSequence has discrete times" }, { .ml_name = "get_breakpoints", .ml_meth = (PyCFunction) TreeSequence_get_breakpoints, .ml_flags = METH_NOARGS, diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 21302cafb8..1dba6d5ab7 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -1270,6 +1270,19 @@ def is_discrete(a): ) assert ts.discrete_genome == discrete_genome + @pytest.mark.parametrize("ts", get_example_tree_sequences()) + def test_discrete_time(self, ts): + def is_discrete(a): + return np.all(np.logical_or(np.floor(a) == a, tskit.is_unknown_time(a))) + + tables = ts.tables + discrete_time = ( + is_discrete(tables.nodes.time) + and is_discrete(tables.mutations.time) + and is_discrete(tables.migrations.time) + ) + assert ts.discrete_time == discrete_time + def test_trees(self): for ts in get_example_tree_sequences(): self.verify_trees(ts) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index df7454e668..b0c56131ba 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -1412,6 +1412,20 @@ def test_clear_table(self, ts_fixture): assert tables.sequence_length == ts_fixture.tables.sequence_length assert tables.time_units == ts_fixture.tables.time_units + def test_discrete_genome(self): + tables = _tskit.TableCollection(1) + tables.build_index() + ts = _tskit.TreeSequence() + ts.load_tables(tables) + assert ts.get_discrete_genome() == 1 + + def test_discrete_time(self): + tables = _tskit.TableCollection(1) + tables.build_index() + ts = _tskit.TreeSequence() + ts.load_tables(tables) + assert ts.get_discrete_time() == 1 + class StatsInterfaceMixin: """ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 802503c6e6..6e367bf7a8 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -3950,6 +3950,23 @@ def discrete_genome(self): """ return bool(self._ll_tree_sequence.get_discrete_genome()) + @property + def discrete_time(self): + """ + Returns True if all time coordinates in this TreeSequence are + discrete integer values. This is true iff all the following are true: + + - All node times are discrete + - All mutation times are discrete + - All migration times are discrete + + Note that ``tskit.UNKNOWN_TIME`` counts as discrete. + + :return: True if this TreeSequence uses discrete time coordinates. + :rtype: bool + """ + return bool(self._ll_tree_sequence.get_discrete_time()) + @property def sequence_length(self): """