Skip to content

Commit dc5e308

Browse files
Implement TreeSequence.discrete_genome
Closes #1144
1 parent 9f25482 commit dc5e308

File tree

7 files changed

+190
-0
lines changed

7 files changed

+190
-0
lines changed

c/tests/test_trees.c

Lines changed: 97 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -925,6 +925,102 @@ verify_empty_tree_sequence(tsk_treeseq_t *ts, double sequence_length)
925925
* Simplest test cases.
926926
*======================================================*/
927927

928+
static void
929+
test_simplest_discrete_genome(void)
930+
{
931+
const char *nodes = "1 0 0\n"
932+
"1 0 0\n"
933+
"0 1 0";
934+
const char *edges = "0 1 2 0,1\n";
935+
tsk_treeseq_t ts;
936+
tsk_table_collection_t tables;
937+
int ret;
938+
939+
tsk_treeseq_from_text(&ts, 1, nodes, edges, NULL, NULL, NULL, NULL, NULL, 0);
940+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
941+
942+
ret = tsk_table_collection_copy(ts.tables, &tables, 0);
943+
CU_ASSERT_EQUAL_FATAL(ret, 0);
944+
tsk_treeseq_free(&ts);
945+
946+
tables.sequence_length = 1.001;
947+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
948+
CU_ASSERT_EQUAL_FATAL(ret, 0);
949+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
950+
tsk_treeseq_free(&ts);
951+
tables.sequence_length = 1;
952+
953+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
954+
CU_ASSERT_EQUAL_FATAL(ret, 0);
955+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
956+
tsk_treeseq_free(&ts);
957+
958+
tables.edges.right[0] = 0.999;
959+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
960+
CU_ASSERT_EQUAL_FATAL(ret, 0);
961+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
962+
tsk_treeseq_free(&ts);
963+
tables.edges.right[0] = 1.0;
964+
965+
tables.edges.left[0] = 0.999;
966+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
967+
CU_ASSERT_EQUAL_FATAL(ret, 0);
968+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
969+
tsk_treeseq_free(&ts);
970+
tables.edges.left[0] = 0;
971+
972+
ret = tsk_site_table_add_row(&tables.sites, 0, "A", 1, NULL, 0);
973+
CU_ASSERT_EQUAL_FATAL(ret, 0);
974+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
975+
CU_ASSERT_EQUAL_FATAL(ret, 0);
976+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
977+
tsk_treeseq_free(&ts);
978+
979+
tables.sites.position[0] = 0.001;
980+
CU_ASSERT_EQUAL_FATAL(ret, 0);
981+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
982+
CU_ASSERT_EQUAL_FATAL(ret, 0);
983+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
984+
tsk_treeseq_free(&ts);
985+
tables.sites.position[0] = 0;
986+
987+
/* Need another population for a migration */
988+
ret = tsk_population_table_add_row(&tables.populations, NULL, 0);
989+
CU_ASSERT_EQUAL_FATAL(ret, 1);
990+
991+
ret = tsk_migration_table_add_row(&tables.migrations, 0, 1, 0, 0, 1, 1.0, NULL, 0);
992+
CU_ASSERT_EQUAL_FATAL(ret, 0);
993+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
994+
CU_ASSERT_EQUAL_FATAL(ret, 0);
995+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
996+
tsk_treeseq_free(&ts);
997+
998+
tables.migrations.left[0] = 0.001;
999+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1000+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1001+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1002+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
1003+
tsk_treeseq_free(&ts);
1004+
tables.migrations.left[0] = 0;
1005+
1006+
tables.migrations.right[0] = 0.999;
1007+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1008+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1009+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1010+
CU_ASSERT_FALSE(tsk_treeseq_get_discrete_genome(&ts));
1011+
tsk_treeseq_free(&ts);
1012+
tables.migrations.right[0] = 1;
1013+
1014+
/* An empty tree sequence is has a discrete genome. */
1015+
tsk_table_collection_clear(&tables, 0);
1016+
ret = tsk_treeseq_init(&ts, &tables, TSK_BUILD_INDEXES);
1017+
CU_ASSERT_EQUAL_FATAL(ret, 0);
1018+
CU_ASSERT_TRUE(tsk_treeseq_get_discrete_genome(&ts));
1019+
tsk_treeseq_free(&ts);
1020+
1021+
tsk_table_collection_free(&tables);
1022+
}
1023+
9281024
static void
9291025
test_simplest_records(void)
9301026
{
@@ -6520,6 +6616,7 @@ main(int argc, char **argv)
65206616
{
65216617
CU_TestInfo tests[] = {
65226618
/* simplest example tests */
6619+
{ "test_simplest_discrete_genome", test_simplest_discrete_genome },
65236620
{ "test_simplest_records", test_simplest_records },
65246621
{ "test_simplest_nonbinary_records", test_simplest_nonbinary_records },
65256622
{ "test_simplest_unary_records", test_simplest_unary_records },

c/tskit/trees.c

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,12 @@
3232

3333
#include <tskit/trees.h>
3434

35+
static inline bool
36+
is_discrete(double x)
37+
{
38+
return floor(x) == x;
39+
}
40+
3541
/* ======================================================== *
3642
* tree sequence
3743
* ======================================================== */
@@ -127,6 +133,8 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
127133
const tsk_size_t num_mutations = self->tables->mutations.num_rows;
128134
const tsk_size_t num_sites = self->tables->sites.num_rows;
129135
const tsk_id_t *restrict mutation_site = self->tables->mutations.site;
136+
const double *restrict site_position = self->tables->sites.position;
137+
bool discrete_sites = true;
130138

131139
self->site_mutations_mem
132140
= tsk_malloc(num_mutations * sizeof(*self->site_mutations_mem));
@@ -148,6 +156,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
148156
}
149157
k = 0;
150158
for (j = 0; j < (tsk_id_t) num_sites; j++) {
159+
discrete_sites = discrete_sites && is_discrete(site_position[j]);
151160
self->site_mutations[j] = self->site_mutations_mem + offset;
152161
self->site_mutations_length[j] = 0;
153162
/* Go through all mutations for this site */
@@ -161,6 +170,7 @@ tsk_treeseq_init_sites(tsk_treeseq_t *self)
161170
goto out;
162171
}
163172
}
173+
self->discrete_genome = self->discrete_genome && discrete_sites;
164174
out:
165175
return ret;
166176
}
@@ -243,6 +253,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
243253
const double *restrict edge_right = self->tables->edges.right;
244254
const double *restrict edge_left = self->tables->edges.left;
245255
tsk_size_t num_trees_alloc = self->num_trees + 1;
256+
bool discrete_breakpoints = true;
246257

247258
self->tree_sites_length
248259
= tsk_malloc(num_trees_alloc * sizeof(*self->tree_sites_length));
@@ -264,6 +275,7 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
264275
j = 0;
265276
k = 0;
266277
while (j < num_edges || tree_left < sequence_length) {
278+
discrete_breakpoints = discrete_breakpoints && is_discrete(tree_left);
267279
self->breakpoints[tree_index] = tree_left;
268280
while (k < num_edges && edge_right[O[k]] == tree_left) {
269281
k++;
@@ -289,11 +301,29 @@ tsk_treeseq_init_trees(tsk_treeseq_t *self)
289301
tsk_bug_assert(site == num_sites);
290302
tsk_bug_assert(tree_index == self->num_trees);
291303
self->breakpoints[tree_index] = tree_right;
304+
discrete_breakpoints = discrete_breakpoints && is_discrete(tree_right);
305+
self->discrete_genome = self->discrete_genome && discrete_breakpoints;
292306
ret = 0;
293307
out:
294308
return ret;
295309
}
296310

311+
static void
312+
tsk_treeseq_init_migrations(tsk_treeseq_t *self)
313+
{
314+
tsk_size_t j;
315+
tsk_size_t num_migrations = self->tables->migrations.num_rows;
316+
const double *restrict left = self->tables->migrations.left;
317+
const double *restrict right = self->tables->migrations.right;
318+
bool discrete_breakpoints = true;
319+
320+
for (j = 0; j < num_migrations; j++) {
321+
discrete_breakpoints
322+
= discrete_breakpoints && is_discrete(left[j]) && is_discrete(right[j]);
323+
}
324+
self->discrete_genome = self->discrete_genome && discrete_breakpoints;
325+
}
326+
297327
static int
298328
tsk_treeseq_init_nodes(tsk_treeseq_t *self)
299329
{
@@ -400,6 +430,7 @@ tsk_treeseq_init(
400430
if (ret != 0) {
401431
goto out;
402432
}
433+
self->discrete_genome = true;
403434
ret = tsk_treeseq_init_sites(self);
404435
if (ret != 0) {
405436
goto out;
@@ -412,6 +443,8 @@ tsk_treeseq_init(
412443
if (ret != 0) {
413444
goto out;
414445
}
446+
tsk_treeseq_init_migrations(self);
447+
415448
if (tsk_treeseq_get_time_units_length(self) == strlen(TSK_TIME_UNITS_UNCALIBRATED)
416449
&& !strncmp(tsk_treeseq_get_time_units(self), TSK_TIME_UNITS_UNCALIBRATED,
417450
strlen(TSK_TIME_UNITS_UNCALIBRATED))) {
@@ -630,6 +663,12 @@ tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u)
630663
return ret;
631664
}
632665

666+
bool
667+
tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self)
668+
{
669+
return self->discrete_genome;
670+
}
671+
633672
/* Stats functions */
634673

635674
#define GET_2D_ROW(array, row_len, row) (array + (((size_t)(row_len)) * (size_t) row))

c/tskit/trees.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,8 @@ typedef struct {
6868
tsk_id_t *samples;
6969
/* Does this tree sequence have time_units == "uncalibrated" */
7070
bool time_uncalibrated;
71+
/* Are all genome coordinates discrete? */
72+
bool discrete_genome;
7173
/* Breakpoints along the sequence, including 0 and L. */
7274
double *breakpoints;
7375
/* If a node is a sample, map to its index in the samples list */
@@ -268,6 +270,7 @@ const double *tsk_treeseq_get_breakpoints(const tsk_treeseq_t *self);
268270
const tsk_id_t *tsk_treeseq_get_samples(const tsk_treeseq_t *self);
269271
const tsk_id_t *tsk_treeseq_get_sample_index_map(const tsk_treeseq_t *self);
270272
bool tsk_treeseq_is_sample(const tsk_treeseq_t *self, tsk_id_t u);
273+
bool tsk_treeseq_get_discrete_genome(const tsk_treeseq_t *self);
271274

272275
int tsk_treeseq_get_node(const tsk_treeseq_t *self, tsk_id_t index, tsk_node_t *node);
273276
int tsk_treeseq_get_edge(const tsk_treeseq_t *self, tsk_id_t index, tsk_edge_t *edge);

python/CHANGELOG.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,8 @@
5454
Roughly a 10X performance increase for "preorder", "postorder", "timeasc"
5555
and "timedesc" (:user:`jeromekelleher`, :pr:`1704`).
5656

57+
- Add the ``discrete_genome`` property to the TreeSequence class which is true if
58+
all coordinates are discrete (:user:`jeromekelleher`, :issue:`1144`, :pr`fixme`)
5759

5860
--------------------
5961
[0.3.7] - 2021-07-08

python/_tskitmodule.c

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7788,6 +7788,19 @@ TreeSequence_get_sequence_length(TreeSequence *self)
77887788
return ret;
77897789
}
77907790

7791+
static PyObject *
7792+
TreeSequence_get_discrete_genome(TreeSequence *self)
7793+
{
7794+
PyObject *ret = NULL;
7795+
7796+
if (TreeSequence_check_state(self) != 0) {
7797+
goto out;
7798+
}
7799+
ret = Py_BuildValue("i", tsk_treeseq_get_discrete_genome(self->tree_sequence));
7800+
out:
7801+
return ret;
7802+
}
7803+
77917804
static PyObject *
77927805
TreeSequence_get_breakpoints(TreeSequence *self)
77937806
{
@@ -8993,6 +9006,10 @@ static PyMethodDef TreeSequence_methods[] = {
89939006
.ml_meth = (PyCFunction) TreeSequence_get_sequence_length,
89949007
.ml_flags = METH_NOARGS,
89959008
.ml_doc = "Returns the sequence length in bases." },
9009+
{ .ml_name = "get_discrete_genome",
9010+
.ml_meth = (PyCFunction) TreeSequence_get_discrete_genome,
9011+
.ml_flags = METH_NOARGS,
9012+
.ml_doc = "Returns True if this TreeSequence has discrete coordinates" },
89969013
{ .ml_name = "get_breakpoints",
89979014
.ml_meth = (PyCFunction) TreeSequence_get_breakpoints,
89989015
.ml_flags = METH_NOARGS,

python/tests/test_highlevel.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1241,6 +1241,22 @@ class TestTreeSequence(HighLevelTestCase):
12411241
Tests for the tree sequence object.
12421242
"""
12431243

1244+
@pytest.mark.parametrize("ts", get_example_tree_sequences())
1245+
def test_discrete_genome(self, ts):
1246+
def is_discrete(a):
1247+
return np.all(np.floor(a) == a)
1248+
1249+
tables = ts.tables
1250+
discrete_genome = (
1251+
is_discrete([tables.sequence_length])
1252+
and is_discrete([tables.edges.left])
1253+
and is_discrete([tables.edges.right])
1254+
and is_discrete([tables.sites.position])
1255+
and is_discrete([tables.migrations.left])
1256+
and is_discrete([tables.migrations.right])
1257+
)
1258+
assert ts.discrete_genome == discrete_genome
1259+
12441260
def test_trees(self):
12451261
for ts in get_example_tree_sequences():
12461262
self.verify_trees(ts)

python/tskit/trees.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3842,6 +3842,22 @@ def get_sample_size(self):
38423842
def file_uuid(self):
38433843
return self._ll_tree_sequence.get_file_uuid()
38443844

3845+
@property
3846+
def discrete_genome(self):
3847+
"""
3848+
Returns True if all genome coordinates in this TreeSequence are
3849+
discrete integer values. This is true iff all the following are true:
3850+
3851+
- The sequence length is discrete
3852+
- All site positions are discrete
3853+
- All left and right edge coordinates are discrete
3854+
- All migration left and right coordinates are discrete
3855+
3856+
:return: True if this TreeSequence uses discrete genome coordinates.
3857+
:rtype: bool
3858+
"""
3859+
return bool(self._ll_tree_sequence.get_discrete_genome())
3860+
38453861
@property
38463862
def sequence_length(self):
38473863
"""

0 commit comments

Comments
 (0)