Skip to content

Commit 3ba31d4

Browse files
Implement TreeSequence.discrete_genome
Closes #1144
1 parent 9f25482 commit 3ba31d4

File tree

7 files changed

+192
-0
lines changed

7 files changed

+192
-0
lines changed

c/tests/test_trees.c

Lines changed: 99 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,104 @@ verify_empty_tree_sequence(tsk_treeseq_t *ts, double sequence_length)
925925
* Simplest test cases.
926926
*======================================================*/
927927

928+
static void
929+
test_simplest_discrete_genome(void)
930+
{
931+
const char *nodes = "1 0 0\n"
932+
"1 0 0\n"
933+
"0 1 0";
934+
const char *edges = "0 1 2 0,1\n";
935+
tsk_treeseq_t ts;
936+
tsk_table_collection_t tables;
937+
tsk_id_t ret_id;
938+
int ret;
939+
940+
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
941+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
942+
943+
ret = tsk_table_collection_copy(ts.tables, &tables, 0);
944+
CU_ASSERT_EQUAL_FATAL(ret, 0);
945+
tsk_treeseq_free(&ts);
946+
947+
tables.sequence_length = 1.001;
948+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
949+
CU_ASSERT_EQUAL_FATAL(ret, 0);
950+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
951+
tsk_treeseq_free(&ts);
952+
tables.sequence_length = 1;
953+
954+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
955+
CU_ASSERT_EQUAL_FATAL(ret, 0);
956+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
957+
tsk_treeseq_free(&ts);
958+
959+
tables.edges.right[0] = 0.999;
960+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
961+
CU_ASSERT_EQUAL_FATAL(ret, 0);
962+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
963+
tsk_treeseq_free(&ts);
964+
tables.edges.right[0] = 1.0;
965+
966+
tables.edges.left[0] = 0.999;
967+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
968+
CU_ASSERT_EQUAL_FATAL(ret, 0);
969+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
970+
tsk_treeseq_free(&ts);
971+
tables.edges.left[0] = 0;
972+
973+
ret_id = tsk_site_table_add_row(&tables.sites, 0, "A", 1, NULL, 0);
974+
CU_ASSERT_EQUAL_FATAL(ret_id, 0);
975+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
976+
CU_ASSERT_EQUAL_FATAL(ret, 0);
977+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
978+
tsk_treeseq_free(&ts);
979+
980+
tables.sites.position[0] = 0.001;
981+
CU_ASSERT_EQUAL_FATAL(ret, 0);
982+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
983+
CU_ASSERT_EQUAL_FATAL(ret, 0);
984+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
985+
tsk_treeseq_free(&ts);
986+
tables.sites.position[0] = 0;
987+
988+
/* Need another population for a migration */
989+
ret_id = tsk_population_table_add_row(&tables.populations, NULL, 0);
990+
CU_ASSERT_EQUAL_FATAL(ret_id, 1);
991+
992+
ret_id
993+
= tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0);
994+
CU_ASSERT_EQUAL_FATAL(ret_id, 0);
995+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
996+
CU_ASSERT_EQUAL_FATAL(ret, 0);
997+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
998+
tsk_treeseq_free(&ts);
999+
1000+
tables.migrations.left[0] = 0.001;
1001+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1002+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1003+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1004+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
1005+
tsk_treeseq_free(&ts);
1006+
tables.migrations.left[0] = 0;
1007+
1008+
tables.migrations.right[0] = 0.999;
1009+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1010+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1011+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1012+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
1013+
tsk_treeseq_free(&ts);
1014+
tables.migrations.right[0] = 1;
1015+
1016+
/* An empty tree sequence is has a discrete genome. */
1017+
tsk_table_collection_clear(&tables, 0);
1018+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1019+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1020+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
1021+
tsk_treeseq_free(&ts);
1022+
1023+
tsk_table_collection_free(&tables);
1024+
}
1025+
9281026
static void
9291027
test_simplest_records(void)
9301028
{
@@ -6520,6 +6618,7 @@ main(int argc, char **argv)
65206618
{
65216619
CU_TestInfo tests[] = {
65226620
/* simplest example tests */
6621+
{ "test_simplest_discrete_genome", test_simplest_discrete_genome },
65236622
{ "test_simplest_records", test_simplest_records },
65246623
{ "test_simplest_nonbinary_records", test_simplest_nonbinary_records },
65256624
{ "test_simplest_unary_records", test_simplest_unary_records },

c/tskit/trees.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232

3333
#include <tskit/trees.h>
3434

35+
static inline bool
36+
is_discrete(double x)
37+
{
38+
return trunc(x) == x;
39+
}
40+
3541
/* ======================================================== *
3642
* tree sequence
3743
* ======================================================== */
@@ -127,6 +133,8 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
127133
const tsk_size_t num_mutations = self->tables->mutations.num_rows;
128134
const tsk_size_t num_sites = self->tables->sites.num_rows;
129135
const tsk_id_t *restrict mutation_site = self->tables->mutations.site;
136+
const double *restrict site_position = self->tables->sites.position;
137+
bool discrete_sites = true;
130138

131139
self->site_mutations_mem
132140
= tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem));
@@ -148,6 +156,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
148156
}
149157
k = 0;
150158
for (j = 0; j < (tsk_id_t) num_sites; j++) {
159+
discrete_sites = discrete_sites && is_discrete(site_position[j]);
151160
self->site_mutations[j] = self->site_mutations_mem + offset;
152161
self->site_mutations_length[j] = 0;
153162
/* Go through all mutations for this site */
@@ -161,6 +170,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
161170
goto out;
162171
}
163172
}
173+
self->discrete_genome = self->discrete_genome && discrete_sites;
164174
out:
165175
return ret;
166176
}
@@ -243,6 +253,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
243253
const double *restrict edge_right = self->tables->edges.right;
244254
const double *restrict edge_left = self->tables->edges.left;
245255
tsk_size_t num_trees_alloc = self->num_trees + 1;
256+
bool discrete_breakpoints = true;
246257

247258
self->tree_sites_length
248259
= tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length));
@@ -264,6 +275,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
264275
j = 0;
265276
k = 0;
266277
while (j < num_edges || tree_left < sequence_length) {
278+
discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left);
267279
self->breakpoints[tree_index] = tree_left;
268280
while (k < num_edges && edge_right[O[k]] == tree_left) {
269281
k++;
@@ -289,11 +301,29 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
289301
tsk_bug_assert(site == num_sites);
290302
tsk_bug_assert(tree_index == self->num_trees);
291303
self->breakpoints[tree_index] = tree_right;
304+
discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right);
305+
self->discrete_genome = self->discrete_genome && discrete_breakpoints;
292306
ret = 0;
293307
out:
294308
return ret;
295309
}
296310

311+
static void
312+
tsk_treeseq_init_migrations(tsk_treeseq_t *self)
313+
{
314+
tsk_size_t j;
315+
tsk_size_t num_migrations = self->tables->migrations.num_rows;
316+
const double *restrict left = self->tables->migrations.left;
317+
const double *restrict right = self->tables->migrations.right;
318+
bool discrete_breakpoints = true;
319+
320+
for (j = 0; j < num_migrations; j++) {
321+
discrete_breakpoints
322+
= discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]);
323+
}
324+
self->discrete_genome = self->discrete_genome && discrete_breakpoints;
325+
}
326+
297327
static int
298328
tsk_treeseq_init_nodes(tsk_treeseq_t *self)
299329
{
@@ -400,6 +430,7 @@ tsk_treeseq_init(
400430
if (ret != 0) {
401431
goto out;
402432
}
433+
self->discrete_genome = true;
403434
ret = tsk_treeseq_init_sites(self);
404435
if (ret != 0) {
405436
goto out;
@@ -412,6 +443,8 @@ tsk_treeseq_init(
412443
if (ret != 0) {
413444
goto out;
414445
}
446+
tsk_treeseq_init_migrations(self);
447+
415448
if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED)
416449
&& !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED,
417450
strlen(TSK_TIME_UNITS_UNCALIBRATED))) {
@@ -630,6 +663,12 @@ tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u)
630663
return ret;
631664
}
632665

666+
bool
667+
tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self)
668+
{
669+
return self->discrete_genome;
670+
}
671+
633672
/* Stats functions */
634673

635674
#define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row))

c/tskit/trees.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ typedef struct {
6868
tsk_id_t *samples;
6969
/* Does this tree sequence have time_units == "uncalibrated" */
7070
bool time_uncalibrated;
71+
/* Are all genome coordinates discrete? */
72+
bool discrete_genome;
7173
/* Breakpoints along the sequence, including 0 and L. */
7274
double *breakpoints;
7375
/* If a node is a sample, map to its index in the samples list */
@@ -268,6 +270,7 @@ const double *tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self);
268270
const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self);
269271
const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self);
270272
bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u);
273+
bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self);
271274

272275
int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node);
273276
int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge);

python/CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
Roughly a 10X performance increase for "preorder", "postorder", "timeasc"
5555
and "timedesc" (:user:`jeromekelleher`, :pr:`1704`).
5656

57+
- Add the ``discrete_genome`` property to the TreeSequence class which is true if
58+
all coordinates are discrete (:user:`jeromekelleher`, :issue:`1144`, :pr:`1819`)
5759

5860
--------------------
5961
[0.3.7] - 2021-07-08

python/_tskitmodule.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7788,6 +7788,19 @@ TreeSequence_get_sequence_length(TreeSequence *self)
77887788
return ret;
77897789
}
77907790

7791+
static PyObject *
7792+
TreeSequence_get_discrete_genome(TreeSequence *self)
7793+
{
7794+
PyObject *ret = NULL;
7795+
7796+
if (TreeSequence_check_state(self) != 0) {
7797+
goto out;
7798+
}
7799+
ret = Py_BuildValue("i", tsk_treeseq_get_discrete_genome(self->tree_sequence));
7800+
out:
7801+
return ret;
7802+
}
7803+
77917804
static PyObject *
77927805
TreeSequence_get_breakpoints(TreeSequence *self)
77937806
{
@@ -8993,6 +9006,10 @@ static PyMethodDef TreeSequence_methods[] = {
89939006
.ml_meth = (PyCFunction) TreeSequence_get_sequence_length,
89949007
.ml_flags = METH_NOARGS,
89959008
.ml_doc = "Returns the sequence length in bases." },
9009+
{ .ml_name = "get_discrete_genome",
9010+
.ml_meth = (PyCFunction) TreeSequence_get_discrete_genome,
9011+
.ml_flags = METH_NOARGS,
9012+
.ml_doc = "Returns True if this TreeSequence has discrete coordinates" },
89969013
{ .ml_name = "get_breakpoints",
89979014
.ml_meth = (PyCFunction) TreeSequence_get_breakpoints,
89989015
.ml_flags = METH_NOARGS,

python/tests/test_highlevel.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,22 @@ class TestTreeSequence(HighLevelTestCase):
12411241
Tests for the tree sequence object.
12421242
"""
12431243

1244+
@pytest.mark.parametrize("ts", get_example_tree_sequences())
1245+
def test_discrete_genome(self, ts):
1246+
def is_discrete(a):
1247+
return np.all(np.floor(a) == a)
1248+
1249+
tables = ts.tables
1250+
discrete_genome = (
1251+
is_discrete([tables.sequence_length])
1252+
and is_discrete(tables.edges.left)
1253+
and is_discrete(tables.edges.right)
1254+
and is_discrete(tables.sites.position)
1255+
and is_discrete(tables.migrations.left)
1256+
and is_discrete(tables.migrations.right)
1257+
)
1258+
assert ts.discrete_genome == discrete_genome
1259+
12441260
def test_trees(self):
12451261
for ts in get_example_tree_sequences():
12461262
self.verify_trees(ts)

python/tskit/trees.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3842,6 +3842,22 @@ def get_sample_size(self):
38423842
def file_uuid(self):
38433843
return self._ll_tree_sequence.get_file_uuid()
38443844

3845+
@property
3846+
def discrete_genome(self):
3847+
"""
3848+
Returns True if all genome coordinates in this TreeSequence are
3849+
discrete integer values. This is true iff all the following are true:
3850+
3851+
- The sequence length is discrete
3852+
- All site positions are discrete
3853+
- All left and right edge coordinates are discrete
3854+
- All migration left and right coordinates are discrete
3855+
3856+
:return: True if this TreeSequence uses discrete genome coordinates.
3857+
:rtype: bool
3858+
"""
3859+
return bool(self._ll_tree_sequence.get_discrete_genome())
3860+
38453861
@property
38463862
def sequence_length(self):
38473863
"""

0 commit comments

Comments
 (0)