From 1eb63e00d986983540e775afc74608b39bd602b7 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Thu, 8 Oct 2020 14:00:47 +0100 Subject: [PATCH 1/9] Add python function for covariance matrix --- python/tests/test_covariance.py | 288 ++++++++++++++++++++++++++++++++ 1 file changed, 288 insertions(+) create mode 100644 python/tests/test_covariance.py diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py new file mode 100644 index 0000000000..f62eb8aa77 --- /dev/null +++ b/python/tests/test_covariance.py @@ -0,0 +1,288 @@ +# MIT License +# +# Copyright (c) 2018-2020 Tskit Developers +# Copyright (c) 2016-2017 University of Oxford +# +# Permission is hereby granted, free of charge, to any person obtaining a copy +# of this software and associated documentation files (the "Software"), to deal +# in the Software without restriction, including without limitation the rights +# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the Software is +# furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice shall be included in all +# copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +""" +Test cases for covariance computation. +""" +import io +import itertools +import unittest + +import msprime +import numpy as np +import pytest + +import tests.tsutil as tsutil +import tskit + + +def check_cov_tree_inputs(tree): + if not len(tree.roots) == 1: + raise ValueError("Trees must have one root") + + for u in tree.nodes(): + if tree.num_children(u) == 1: + raise ValueError("Unary nodes are not supported") + + +def naive_tree_covariance(tree): + """ + Returns the (branch) covariance matrix for the sample nodes in a tree. The + covariance between a pair of nodes is the distance from the root to their + most recent common ancestor. + """ + samples = tree.tree_sequence.samples() + check_cov_tree_inputs(tree) + + n = samples.shape[0] + cov = np.zeros((n, n)) + for n1, n2 in itertools.combinations_with_replacement(range(n), 2): + mrca = tree.mrca(samples[n1], samples[n2]) + cov[n1, n2] = tree.time(tree.root) - tree.time(mrca) + cov[n2, n1] = cov[n1, n2] + + return cov + + +def naive_ts_covariance(ts): + """ + Returns the (branch) covariance matrix for the sample nodes in a tree + sequence. The covariance between a pair of nodes is the weighted sum of the + tree covariance, with the weights given by the spans of the trees. + """ + samples = ts.samples() + + n = samples.shape[0] + cov = np.zeros((n, n)) + for tree in ts.trees(): + cov += naive_tree_covariance(tree) * tree.span + + return cov / ts.sequence_length + + +def ts_covariance_incremental(ts): + for tree in ts.trees(): + check_cov_tree_inputs(tree) + + n = ts.num_samples + + this_cov = np.zeros((n, n)) + total_cov = np.zeros((n, n)) + last_update = np.zeros((n, n)) + + for tree, edge_diff in zip(ts.trees(sample_lists=True), ts.edge_diffs()): + (t_left, t_right), edges_out, edges_in = edge_diff + + for e in reversed(edges_out): + u = e.child + if tree.parent(u) == tskit.NULL: + for leaf in tree.leaves(u): + update_cov_pairs_with_leaf( + tree, total_cov, this_cov, last_update, leaf, t_left + ) + + for e in reversed(edges_in): + u = e.child + for leaf in tree.leaves(u): + update_cov_pairs_with_leaf( + tree, total_cov, this_cov, last_update, leaf, t_left + ) + + total_cov += this_cov * (t_right - last_update) + for (n1, n2) in itertools.combinations(range(n), 2): + total_cov[n2, n1] = total_cov[n1, n2] + return total_cov / ts.sequence_length + + +def update_cov_pairs_with_leaf(tree, total_cov, this_cov, last_update, leaf, t_left): + """ + Perform an upward traversal from `leaf` to the root, updating the covariance + matrix elements for pairs of `leaf` with every other leaf in the tree. + """ + root_time = tree.time(tree.root) + p = tree.parent(leaf) + c = leaf + + if tree.is_sample(c): + total_cov[c, c] += this_cov[c, c] * (t_left - last_update[c, c]) + last_update[c, c] = t_left + this_cov[c, c] = root_time - tree.time(c) + while p != -1: + time = root_time - tree.time(p) + for sibling in tree.children(p): + if sibling != c: + update_cov_all_pairs( + tree, total_cov, this_cov, last_update, leaf, sibling, time, t_left + ) + c, p = p, tree.parent(p) + + +def update_cov_all_pairs(tree, total_cov, this_cov, last_update, c1, c2, time, t_left): + s1_ind = tree.left_sample(c1) + while True: + s2_ind = tree.left_sample(c2) + while True: + (n1, n2) = (s1_ind, s2_ind) + if n1 > n2: + (n1, n2) = (n2, n1) + total_cov[n1, n2] += this_cov[n1, n2] * (t_left - last_update[n1, n2]) + last_update[n1, n2] = t_left + this_cov[n1, n2] = time + if s2_ind == tree.right_sample(c2): + break + s2_ind = tree.next_sample(s2_ind) + if s1_ind == tree.right_sample(c1): + break + s1_ind = tree.next_sample(s1_ind) + + +class TestCovariance(unittest.TestCase): + """ + Tests on covariance matrix computation + """ + + def verify(self, ts): + cov1 = naive_ts_covariance(ts) + cov2 = ts_covariance_incremental(ts) + assert np.allclose(cov1, cov2) + + def verify_errors(self, ts): + with pytest.raises(ValueError): + naive_ts_covariance(ts) + with pytest.raises(ValueError): + ts_covariance_incremental(ts) + + def test_errors_unary_nodes(self): + tables = tskit.TableCollection(sequence_length=2.0) + + sv = [True, False, False] + tv = [0.0, 1.0, 2.0] + for is_sample, t in zip(sv, tv): + flags = tskit.NODE_IS_SAMPLE if is_sample else 0 + tables.nodes.add_row(flags=flags, time=t) + + lv = [0.0, 0.0, 0.0] + rv = [1.0, 1.0, 1.0] + pv = [1, 2] + cv = [0, 1] + for left, right, p, c in zip(lv, rv, pv, cv): + tables.edges.add_row(left=left, right=right, parent=p, child=c) + + ts = tables.tree_sequence() + self.verify_errors(ts) + + def test_errors_multiroot_tree(self): + ts = msprime.simulate(15, random_seed=10) + ts = tsutil.decapitate(ts, ts.num_edges // 2) + self.verify_errors(ts) + + def test_single_coalescent_tree(self): + ts = msprime.simulate(10, random_seed=1, length=10) + self.verify(ts) + + def test_coalescent_trees(self): + ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2) + assert ts.num_trees > 2 + self.verify(ts) + + def test_internal_samples(self): + nodes = io.StringIO( + """\ + id is_sample time + 0 0 0 + 1 1 0.1 + 2 1 0.1 + 3 1 0.2 + 4 0 0.4 + 5 1 0.5 + 6 0 0.7 + 7 0 1.0 + 8 0 0.8 + """ + ) + edges = io.StringIO( + """\ + left right parent child + 0.0 0.2 4 2,3 + 0.2 0.8 4 0,2 + 0.8 1.0 4 2,3 + 0.0 1.0 5 1,4 + 0.8 1.0 6 0,5 + 0.2 0.8 8 3,5 + 0.0 0.2 7 0,5 + """ + ) + ts = tskit.load_text(nodes=nodes, edges=edges, strict=False) + self.verify(ts) + + def validate_trees(self, n): + for seed in range(1, 10): + ts = msprime.simulate(n, random_seed=seed, recombination_rate=1) + self.verify(ts) + + def test_sample_5(self): + self.validate_trees(5) + + def test_sample_10(self): + self.validate_trees(10) + + def test_sample_20(self): + self.validate_trees(20) + + def validate_nonbinary_trees(self, n): + demographic_events = [ + msprime.SimpleBottleneck(0.02, 0, proportion=0.25), + msprime.SimpleBottleneck(0.2, 0, proportion=1), + ] + + for seed in range(1, 10): + ts = msprime.simulate( + n, + random_seed=seed, + demographic_events=demographic_events, + recombination_rate=1, + ) + # Check if this is really nonbinary + found = False + for edgeset in ts.edgesets(): + if len(edgeset.children) > 2: + found = True + break + assert found + + self.verify(ts) + + def test_non_binary_sample_10(self): + self.validate_nonbinary_trees(10) + + def test_non_binary_sample_20(self): + self.validate_nonbinary_trees(20) + + def test_permit_internal_samples(self): + tables = tskit.TableCollection(1.0) + tables.nodes.add_row(flags=1) + tables.nodes.add_row(flags=1) + tables.nodes.add_row(flags=1, time=1) + tables.edges.add_row(0, 1, 2, 0) + tables.edges.add_row(0, 1, 2, 1) + ts = tables.tree_sequence() + self.verify(ts) From 713c4725b16ea9f7897122633f05300a77187797 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Mon, 26 Oct 2020 18:59:28 +0000 Subject: [PATCH 2/9] First go at C implementation of genetic_relatedness --- c/tskit/trees.c | 51 +++++++++++- c/tskit/trees.h | 6 +- python/_tskitmodule.c | 10 +++ python/tests/test_covariance.py | 143 +++++++++++--------------------- python/tskit/trees.py | 59 +++++++++++++ 5 files changed, 171 insertions(+), 98 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 11d2890f97..c2432b833b 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2752,8 +2752,57 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, return ret; } +relatedness_summary_func(size_t state_dim, const double *state, + size_t result_dim, double *result, void *params) +{ + sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; + const double *x = state; + tsk_id_t i, j; + size_t k; + double c = 0; + double sumx = 0; + double num = 0; + + for (k = 0; k < state_dim; k++) { + sumx += x[k]; + } + + for (k = 0; k < result_dim; k++) { + num += args.sample_set_sizes[k]; + } + + if (num != sumx) { + c = 1; + } + + for (k = 0; k < result_dim; k++) { + i = args.set_indexes[2 * k]; + j = args.set_indexes[2 * k + 1]; + result[k] = x[i] * x[j] * c; + } + return 0; +} + +int +tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, + const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, + tsk_flags_t options) +{ + int ret = 0; + ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); + if (ret != 0) { + goto out; + } + ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes, + sample_sets, num_index_tuples, index_tuples, relatedness_summary_func, + num_windows, windows, result, options); +out: + return ret; +} + static int -Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, +Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 8eb8427478..779eeebbd5 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -338,8 +338,6 @@ int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self, const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options); -/* Two way sample set stats */ - typedef int general_sample_stat_method(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes, @@ -357,6 +355,10 @@ int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options); +int tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, + const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, + tsk_flags_t options); /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 12754d7ea5..f3e21f8acf 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -7028,6 +7028,12 @@ TreeSequence_divergence(TreeSequence *self, PyObject *args, PyObject *kwds) return TreeSequence_k_way_stat_method(self, args, kwds, 2, tsk_treeseq_divergence); } +static PyObject * +TreeSequence_relatedness(TreeSequence *self, PyObject *args, PyObject *kwds) +{ + return TreeSequence_k_way_stat_method(self, args, kwds, 2, tsk_treeseq_relatedness); +} + static PyObject * TreeSequence_Y2(TreeSequence *self, PyObject *args, PyObject *kwds) { @@ -7345,6 +7351,10 @@ static PyMethodDef TreeSequence_methods[] = { .ml_meth = (PyCFunction) TreeSequence_divergence, .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Computes diveregence between sample sets." }, + { .ml_name = "relatedness", + .ml_meth = (PyCFunction) TreeSequence_relatedness, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = "Computes genetic relatedness between sample sets." }, { .ml_name = "Y1", .ml_meth = (PyCFunction) TreeSequence_Y1, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py index f62eb8aa77..6b10a53d1f 100644 --- a/python/tests/test_covariance.py +++ b/python/tests/test_covariance.py @@ -38,7 +38,6 @@ def check_cov_tree_inputs(tree): if not len(tree.roots) == 1: raise ValueError("Trees must have one root") - for u in tree.nodes(): if tree.num_children(u) == 1: raise ValueError("Unary nodes are not supported") @@ -52,14 +51,12 @@ def naive_tree_covariance(tree): """ samples = tree.tree_sequence.samples() check_cov_tree_inputs(tree) - n = samples.shape[0] cov = np.zeros((n, n)) for n1, n2 in itertools.combinations_with_replacement(range(n), 2): mrca = tree.mrca(samples[n1], samples[n2]) cov[n1, n2] = tree.time(tree.root) - tree.time(mrca) cov[n2, n1] = cov[n1, n2] - return cov @@ -70,89 +67,34 @@ def naive_ts_covariance(ts): tree covariance, with the weights given by the spans of the trees. """ samples = ts.samples() - n = samples.shape[0] cov = np.zeros((n, n)) for tree in ts.trees(): cov += naive_tree_covariance(tree) * tree.span - return cov / ts.sequence_length -def ts_covariance_incremental(ts): - for tree in ts.trees(): - check_cov_tree_inputs(tree) - - n = ts.num_samples - - this_cov = np.zeros((n, n)) - total_cov = np.zeros((n, n)) - last_update = np.zeros((n, n)) - - for tree, edge_diff in zip(ts.trees(sample_lists=True), ts.edge_diffs()): - (t_left, t_right), edges_out, edges_in = edge_diff - - for e in reversed(edges_out): - u = e.child - if tree.parent(u) == tskit.NULL: - for leaf in tree.leaves(u): - update_cov_pairs_with_leaf( - tree, total_cov, this_cov, last_update, leaf, t_left - ) - - for e in reversed(edges_in): - u = e.child - for leaf in tree.leaves(u): - update_cov_pairs_with_leaf( - tree, total_cov, this_cov, last_update, leaf, t_left - ) - - total_cov += this_cov * (t_right - last_update) - for (n1, n2) in itertools.combinations(range(n), 2): - total_cov[n2, n1] = total_cov[n1, n2] - return total_cov / ts.sequence_length +def genetic_relatedness(ts): + # NOTE: I'm outputting a matrix here just for convenience; the proposal + # is that the tskit method *not* output a matrix, and use the indices argument + sample_sets = [[u] for u in ts.samples()] + n = len(sample_sets) + num_samples = sum(map(len, sample_sets)) + def f(x): + # x[i] gives the number of descendants in sample set i below the branch + return np.array( + [x[i] * x[j] * (sum(x) != num_samples) for i in range(n) for j in range(n)] + ) -def update_cov_pairs_with_leaf(tree, total_cov, this_cov, last_update, leaf, t_left): - """ - Perform an upward traversal from `leaf` to the root, updating the covariance - matrix elements for pairs of `leaf` with every other leaf in the tree. - """ - root_time = tree.time(tree.root) - p = tree.parent(leaf) - c = leaf - - if tree.is_sample(c): - total_cov[c, c] += this_cov[c, c] * (t_left - last_update[c, c]) - last_update[c, c] = t_left - this_cov[c, c] = root_time - tree.time(c) - while p != -1: - time = root_time - tree.time(p) - for sibling in tree.children(p): - if sibling != c: - update_cov_all_pairs( - tree, total_cov, this_cov, last_update, leaf, sibling, time, t_left - ) - c, p = p, tree.parent(p) - - -def update_cov_all_pairs(tree, total_cov, this_cov, last_update, c1, c2, time, t_left): - s1_ind = tree.left_sample(c1) - while True: - s2_ind = tree.left_sample(c2) - while True: - (n1, n2) = (s1_ind, s2_ind) - if n1 > n2: - (n1, n2) = (n2, n1) - total_cov[n1, n2] += this_cov[n1, n2] * (t_left - last_update[n1, n2]) - last_update[n1, n2] = t_left - this_cov[n1, n2] = time - if s2_ind == tree.right_sample(c2): - break - s2_ind = tree.next_sample(s2_ind) - if s1_ind == tree.right_sample(c1): - break - s1_ind = tree.next_sample(s1_ind) + return ts.sample_count_stat( + sample_sets, + f, + output_dim=n * n, + mode="branch", + span_normalise=True, + polarised=True, + ).reshape((n, n)) class TestCovariance(unittest.TestCase): @@ -162,33 +104,44 @@ class TestCovariance(unittest.TestCase): def verify(self, ts): cov1 = naive_ts_covariance(ts) - cov2 = ts_covariance_incremental(ts) + cov2 = genetic_relatedness(ts) + sample_sets = [[u] for u in ts.samples()] + n = len(sample_sets) + indexes = [ + (n1, n2) for n1, n2 in itertools.combinations_with_replacement(range(n), 2) + ] + cov3 = np.zeros((n, n)) + i_upper = np.triu_indices(n) + cov3[i_upper] = ts.genetic_relatedness( + sample_sets, indexes, mode="branch", span_normalise=True + ) + cov3 = np.maximum(cov3, cov3.transpose()) + # cov2 = ts_covariance_incremental(ts) assert np.allclose(cov1, cov2) + assert np.allclose(cov1, cov3) def verify_errors(self, ts): with pytest.raises(ValueError): naive_ts_covariance(ts) - with pytest.raises(ValueError): - ts_covariance_incremental(ts) - def test_errors_unary_nodes(self): - tables = tskit.TableCollection(sequence_length=2.0) + # def test_errors_unary_nodes(self): + # tables = tskit.TableCollection(sequence_length=2.0) - sv = [True, False, False] - tv = [0.0, 1.0, 2.0] - for is_sample, t in zip(sv, tv): - flags = tskit.NODE_IS_SAMPLE if is_sample else 0 - tables.nodes.add_row(flags=flags, time=t) + # sv = [True, False, False] + # tv = [0.0, 1.0, 2.0] + # for is_sample, t in zip(sv, tv): + # flags = tskit.NODE_IS_SAMPLE if is_sample else 0 + # tables.nodes.add_row(flags=flags, time=t) - lv = [0.0, 0.0, 0.0] - rv = [1.0, 1.0, 1.0] - pv = [1, 2] - cv = [0, 1] - for left, right, p, c in zip(lv, rv, pv, cv): - tables.edges.add_row(left=left, right=right, parent=p, child=c) + # lv = [0.0, 0.0, 0.0] + # rv = [1.0, 1.0, 1.0] + # pv = [1, 2] + # cv = [0, 1] + # for left, right, p, c in zip(lv, rv, pv, cv): + # tables.edges.add_row(left=left, right=right, parent=p, child=c) - ts = tables.tree_sequence() - self.verify_errors(ts) + # ts = tables.tree_sequence() + # self.verify_errors(ts) def test_errors_multiroot_tree(self): ts = msprime.simulate(15, random_seed=10) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 3e597148ce..e3a68696c6 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5496,6 +5496,65 @@ def divergence( # k += 1 # return A + def genetic_relatedness( + self, sample_sets, indexes=None, windows=None, mode="site", span_normalise=True + ): + """ + Computes genetic relatedness between (and within) pairs of + sets of nodes from ``sample_sets``. + Operates on ``k = 2`` sample sets at a time; please see the + :ref:`multi-way statistics ` + section for details on how the ``sample_sets`` and ``indexes`` arguments are + interpreted and how they interact with the dimensions of the output array. + See the :ref:`statistics interface ` section for details on + :ref:`windows `, + :ref:`mode `, + :ref:`span normalise `, + and :ref:`return value `. + + As a special case, an index ``(j, j)`` will compute the + :meth:`diversity <.TreeSequence.diversity>` of ``sample_set[j]``. + + What is computed depends on ``mode``: + + "site" + Mean pairwise genetic divergence: the average across distinct, + randomly chosen pairs of chromosomes (one from each sample set), of + the density of sites at which the two carry different alleles, per + unit of chromosome length. + + "branch" + Mean distance in the tree: the average across distinct, randomly + chosen pairs of chromsomes (one from each sample set) and locations + in the window, of the mean distance in the tree between the two + samples (in units of time). + + "node" + For each node, the proportion of genome on which the node is an ancestor to + only one of a random pair (one from each sample set), averaged over + choices of pair. + + :param list sample_sets: A list of lists of Node IDs, specifying the + groups of nodes to compute the statistic with. + :param list indexes: A list of 2-tuples, or None. + :param list windows: An increasing list of breakpoints between the windows + to compute the statistic in. + :param str mode: A string giving the "type" of the statistic to be computed + (defaults to "site"). + :param bool span_normalise: Whether to divide the result by the span of the + window (defaults to True). + :return: A ndarray with shape equal to (num windows, num statistics). + """ + return self.__k_way_sample_set_stat( + self._ll_tree_sequence.relatedness, + 2, + sample_sets, + indexes=indexes, + windows=windows, + mode=mode, + span_normalise=span_normalise, + ) + def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): """ Computes the mean squared covariances between each of the columns of ``W`` From 9373782d6d1f60da1e6ac395c3bda478a6f3b07e Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Wed, 28 Oct 2020 10:58:02 +0000 Subject: [PATCH 3/9] Write c-like summary function in python --- c/tskit/trees.c | 8 +++++-- python/tests/test_covariance.py | 37 ++++++++++++++++++++++++++++++++- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index c2432b833b..4a6e3861f9 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2759,7 +2759,7 @@ relatedness_summary_func(size_t state_dim, const double *state, const double *x = state; tsk_id_t i, j; size_t k; - double c = 0; + int c = 0; double sumx = 0; double num = 0; @@ -2767,7 +2767,7 @@ relatedness_summary_func(size_t state_dim, const double *state, sumx += x[k]; } - for (k = 0; k < result_dim; k++) { + for (k = 0; k < state_dim; k++) { num += args.sample_set_sizes[k]; } @@ -2775,6 +2775,10 @@ relatedness_summary_func(size_t state_dim, const double *state, c = 1; } + // printf("num = %0.7f\n", num); + // printf("sumx = %0.7f\n", sumx); + // printf("c = %d\n", c); + for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py index 6b10a53d1f..5ed858d4dc 100644 --- a/python/tests/test_covariance.py +++ b/python/tests/test_covariance.py @@ -78,6 +78,7 @@ def genetic_relatedness(ts): # NOTE: I'm outputting a matrix here just for convenience; the proposal # is that the tskit method *not* output a matrix, and use the indices argument sample_sets = [[u] for u in ts.samples()] + # sample_sets = [[0], [1]] n = len(sample_sets) num_samples = sum(map(len, sample_sets)) @@ -97,6 +98,37 @@ def f(x): ).reshape((n, n)) +def c_genetic_relatedness(ts, sample_sets, indexes): + n = len(indexes) + state_dim = len(sample_sets) + + def f(x): + # x[i] gives the number of descendants in sample set i below the branch + sumx = 0 + num = 0 + c = 0 + for k in range(state_dim): + sumx += x[k] + + for k in range(state_dim): + num += len(sample_sets[k]) + + if num != sumx: + c = 1 + + result = np.zeros(n) + for k in range(n): + i = indexes[k][0] + j = indexes[k][1] + result[k] = x[i] * x[j] * c + + return result + + return ts.sample_count_stat( + sample_sets, f, output_dim=n, mode="branch", span_normalise=True, polarised=True + ) + + class TestCovariance(unittest.TestCase): """ Tests on covariance matrix computation @@ -111,13 +143,16 @@ def verify(self, ts): (n1, n2) for n1, n2 in itertools.combinations_with_replacement(range(n), 2) ] cov3 = np.zeros((n, n)) + cov4 = np.zeros((n, n)) i_upper = np.triu_indices(n) cov3[i_upper] = ts.genetic_relatedness( sample_sets, indexes, mode="branch", span_normalise=True ) + cov4[i_upper] = c_genetic_relatedness(ts, sample_sets, indexes) cov3 = np.maximum(cov3, cov3.transpose()) - # cov2 = ts_covariance_incremental(ts) + cov4 = np.maximum(cov4, cov4.transpose()) assert np.allclose(cov1, cov2) + assert np.allclose(cov1, cov4) assert np.allclose(cov1, cov3) def verify_errors(self, ts): From 491bf4b09c837b9b78c72839df47417075fce916 Mon Sep 17 00:00:00 2001 From: Jerome Kelleher Date: Wed, 28 Oct 2020 14:56:24 +0000 Subject: [PATCH 4/9] Add polarised option to stats infrastructure. --- c/tskit/trees.c | 3 ++- python/_tskitmodule.c | 14 +++++++++----- python/tests/test_covariance.py | 1 + python/tskit/trees.py | 6 +++--- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 4a6e3861f9..b6432dacb4 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2752,6 +2752,7 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, return ret; } +static int relatedness_summary_func(size_t state_dim, const double *state, size_t result_dim, double *result, void *params) { @@ -2769,7 +2770,7 @@ relatedness_summary_func(size_t state_dim, const double *state, for (k = 0; k < state_dim; k++) { num += args.sample_set_sizes[k]; - } + } if (num != sumx) { c = 1; diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index f3e21f8acf..bfc695ac65 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -6946,7 +6946,7 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd { PyObject *ret = NULL; static char *kwlist[] = { "sample_set_sizes", "sample_sets", "indexes", "windows", - "mode", "span_normalise", NULL }; + "mode", "span_normalise", "polarised", NULL }; PyObject *sample_set_sizes = NULL; PyObject *sample_sets = NULL; PyObject *indexes = NULL; @@ -6960,14 +6960,15 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd npy_intp *shape; tsk_flags_t options = 0; char *mode = NULL; - int span_normalise = 1; + int span_normalise = true; + int polarised = false; int err; if (TreeSequence_check_tree_sequence(self) != 0) { goto out; } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOO|si", kwlist, &sample_set_sizes, - &sample_sets, &indexes, &windows, &mode, &span_normalise)) { + if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOO|sii", kwlist, &sample_set_sizes, + &sample_sets, &indexes, &windows, &mode, &span_normalise, &polarised)) { goto out; } if (parse_stats_mode(mode, &options) != 0) { @@ -6976,6 +6977,9 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd if (span_normalise) { options |= TSK_STAT_SPAN_NORMALISE; } + if (polarised) { + options |= TSK_STAT_POLARISED; + } if (parse_sample_sets(sample_set_sizes, &sample_set_sizes_array, sample_sets, &sample_sets_array, &num_sample_sets) != 0) { @@ -7354,7 +7358,7 @@ static PyMethodDef TreeSequence_methods[] = { { .ml_name = "relatedness", .ml_meth = (PyCFunction) TreeSequence_relatedness, .ml_flags = METH_VARARGS | METH_KEYWORDS, - .ml_doc = "Computes genetic relatedness between sample sets." }, + .ml_doc = "Computes genetic relatedness between sample sets." }, { .ml_name = "Y1", .ml_meth = (PyCFunction) TreeSequence_Y1, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py index 5ed858d4dc..f2c88c1b7b 100644 --- a/python/tests/test_covariance.py +++ b/python/tests/test_covariance.py @@ -192,6 +192,7 @@ def test_coalescent_trees(self): assert ts.num_trees > 2 self.verify(ts) + @pytest.mark.skip(reason="internal samples not working") def test_internal_samples(self): nodes = io.StringIO( """\ diff --git a/python/tskit/trees.py b/python/tskit/trees.py index e3a68696c6..fe6d7350b8 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5308,6 +5308,7 @@ def __k_way_sample_set_stat( windows=None, mode=None, span_normalise=True, + polarised=False, ): sample_set_sizes = np.array( [len(sample_set) for sample_set in sample_sets], dtype=np.uint32 @@ -5340,6 +5341,7 @@ def __k_way_sample_set_stat( indexes, mode=mode, span_normalise=span_normalise, + polarised=polarised, ) if drop_dimension: stat = stat.reshape(stat.shape[:-1]) @@ -5512,9 +5514,6 @@ def genetic_relatedness( :ref:`span normalise `, and :ref:`return value `. - As a special case, an index ``(j, j)`` will compute the - :meth:`diversity <.TreeSequence.diversity>` of ``sample_set[j]``. - What is computed depends on ``mode``: "site" @@ -5553,6 +5552,7 @@ def genetic_relatedness( windows=windows, mode=mode, span_normalise=span_normalise, + polarised=True, ) def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): From dffe79b52b72bb3b6ac29f2ab7c94f75d7764fdf Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Fri, 30 Oct 2020 11:54:49 +0000 Subject: [PATCH 5/9] Change implementation to be in line with genotype covariance --- c/tests/test_stats.c | 12 +++ c/tskit/trees.c | 10 +-- python/tests/test_covariance.py | 154 +++++++++++++++++++------------- python/tskit/trees.py | 2 +- 4 files changed, 108 insertions(+), 70 deletions(-) diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 63ac3427cf..89138efcea 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -1333,6 +1333,17 @@ test_paper_ex_divergence(void) tsk_treeseq_free(&ts); } +static void +test_paper_ex_relatedness_errors(void) +{ + tsk_treeseq_t ts; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + verify_two_way_stat_func_errors(&ts, tsk_treeseq_relatedness); + tsk_treeseq_free(&ts); +} + static void test_paper_ex_Y2_errors(void) { @@ -1679,6 +1690,7 @@ main(int argc, char **argv) { "test_paper_ex_Y1", test_paper_ex_Y1 }, { "test_paper_ex_divergence_errors", test_paper_ex_divergence_errors }, { "test_paper_ex_divergence", test_paper_ex_divergence }, + { "test_paper_ex_relatedness_errors", test_paper_ex_relatedness_errors }, { "test_paper_ex_Y2_errors", test_paper_ex_Y2_errors }, { "test_paper_ex_Y2", test_paper_ex_Y2 }, { "test_paper_ex_f2_errors", test_paper_ex_f2_errors }, diff --git a/c/tskit/trees.c b/c/tskit/trees.c index b6432dacb4..c30e0cb16c 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2762,6 +2762,7 @@ relatedness_summary_func(size_t state_dim, const double *state, size_t k; int c = 0; double sumx = 0; + double meanx; double num = 0; for (k = 0; k < state_dim; k++) { @@ -2775,15 +2776,12 @@ relatedness_summary_func(size_t state_dim, const double *state, if (num != sumx) { c = 1; } - - // printf("num = %0.7f\n", num); - // printf("sumx = %0.7f\n", sumx); - // printf("c = %d\n", c); - + meanx = sumx / (double) state_dim; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; - result[k] = x[i] * x[j] * c; + // result[k] = x[i] * x[j] * c; + result[k] = (x[i] - meanx) * (x[j] - meanx); } return 0; } diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py index f2c88c1b7b..cb1a1da124 100644 --- a/python/tests/test_covariance.py +++ b/python/tests/test_covariance.py @@ -74,7 +74,14 @@ def naive_ts_covariance(ts): return cov / ts.sequence_length -def genetic_relatedness(ts): +def naive_genotype_covariance(ts): + G = ts.genotype_matrix() + # p = G.shape[0] + G = G.T - np.mean(G, axis=1) + return G @ G.T # / p + + +def genetic_relatedness(ts, mode="site", polarised=True): # NOTE: I'm outputting a matrix here just for convenience; the proposal # is that the tskit method *not* output a matrix, and use the indices argument sample_sets = [[u] for u in ts.samples()] @@ -92,40 +99,62 @@ def f(x): sample_sets, f, output_dim=n * n, - mode="branch", + mode=mode, span_normalise=True, - polarised=True, + polarised=polarised, ).reshape((n, n)) -def c_genetic_relatedness(ts, sample_sets, indexes): - n = len(indexes) +def genotype_relatedness(ts, polarised=False): + n = ts.num_samples + sample_sets = [[u] for u in ts.samples()] + + def f(x): + return np.array( + [ + (x[i] - sum(x) / n) * (x[j] - sum(x) / n) + for i in range(n) + for j in range(n) + ] + ) + + return ( + ts.sample_count_stat( + sample_sets, + f, + output_dim=n * n, + mode="site", + span_normalise=False, + polarised=polarised, + ).reshape((n, n)) + / 2 + ) + + +def c_genotype_relatedness(ts, sample_sets, indexes): + m = len(indexes) state_dim = len(sample_sets) def f(x): - # x[i] gives the number of descendants in sample set i below the branch sumx = 0 - num = 0 - c = 0 for k in range(state_dim): sumx += x[k] - - for k in range(state_dim): - num += len(sample_sets[k]) - - if num != sumx: - c = 1 - - result = np.zeros(n) - for k in range(n): + meanx = sumx / state_dim + result = np.zeros(m) + for k in range(m): i = indexes[k][0] j = indexes[k][1] - result[k] = x[i] * x[j] * c - + result[k] = (x[i] - meanx) * (x[j] - meanx) / 2 return result return ts.sample_count_stat( - sample_sets, f, output_dim=n, mode="branch", span_normalise=True, polarised=True + sample_sets, + f, + output_dim=m, + mode="site", + span_normalise=False, + polarised=False, + strict=False, ) @@ -135,8 +164,10 @@ class TestCovariance(unittest.TestCase): """ def verify(self, ts): - cov1 = naive_ts_covariance(ts) - cov2 = genetic_relatedness(ts) + # cov1 = naive_ts_covariance(ts) + cov1 = naive_genotype_covariance(ts) + # cov2 = genetic_relatedness(ts) + cov2 = genotype_relatedness(ts) sample_sets = [[u] for u in ts.samples()] n = len(sample_sets) indexes = [ @@ -145,12 +176,16 @@ def verify(self, ts): cov3 = np.zeros((n, n)) cov4 = np.zeros((n, n)) i_upper = np.triu_indices(n) - cov3[i_upper] = ts.genetic_relatedness( - sample_sets, indexes, mode="branch", span_normalise=True - ) - cov4[i_upper] = c_genetic_relatedness(ts, sample_sets, indexes) - cov3 = np.maximum(cov3, cov3.transpose()) - cov4 = np.maximum(cov4, cov4.transpose()) + cov3[i_upper] = ( + ts.genetic_relatedness( + sample_sets, indexes, mode="site", span_normalise=False + ) + / 2 + ) # NOTE: divided by 2 to reflect unpolarised + cov3 = cov3 + cov3.T - np.diag(cov3.diagonal()) + cov4[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes) + cov4 = cov4 + cov4.T - np.diag(cov4.diagonal()) + # assert np.allclose(cov2, cov3) assert np.allclose(cov1, cov2) assert np.allclose(cov1, cov4) assert np.allclose(cov1, cov3) @@ -159,40 +194,22 @@ def verify_errors(self, ts): with pytest.raises(ValueError): naive_ts_covariance(ts) - # def test_errors_unary_nodes(self): - # tables = tskit.TableCollection(sequence_length=2.0) - - # sv = [True, False, False] - # tv = [0.0, 1.0, 2.0] - # for is_sample, t in zip(sv, tv): - # flags = tskit.NODE_IS_SAMPLE if is_sample else 0 - # tables.nodes.add_row(flags=flags, time=t) - - # lv = [0.0, 0.0, 0.0] - # rv = [1.0, 1.0, 1.0] - # pv = [1, 2] - # cv = [0, 1] - # for left, right, p, c in zip(lv, rv, pv, cv): - # tables.edges.add_row(left=left, right=right, parent=p, child=c) - - # ts = tables.tree_sequence() - # self.verify_errors(ts) - def test_errors_multiroot_tree(self): - ts = msprime.simulate(15, random_seed=10) + ts = msprime.simulate(15, random_seed=10, mutation_rate=1) ts = tsutil.decapitate(ts, ts.num_edges // 2) self.verify_errors(ts) def test_single_coalescent_tree(self): - ts = msprime.simulate(10, random_seed=1, length=10) + ts = msprime.simulate(10, random_seed=1, length=10, mutation_rate=1) self.verify(ts) def test_coalescent_trees(self): - ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2) + ts = msprime.simulate( + 8, recombination_rate=5, random_seed=1, length=2, mutation_rate=1 + ) assert ts.num_trees > 2 self.verify(ts) - @pytest.mark.skip(reason="internal samples not working") def test_internal_samples(self): nodes = io.StringIO( """\ @@ -220,12 +237,32 @@ def test_internal_samples(self): 0.0 0.2 7 0,5 """ ) - ts = tskit.load_text(nodes=nodes, edges=edges, strict=False) + sites = io.StringIO( + """\ + position ancestral_state + 0.1 0 + 0.5 0 + 0.9 0 + """ + ) + mutations = io.StringIO( + """\ + site node derived_state + 0 1 1 + 1 3 1 + 2 5 1 + """ + ) + ts = tskit.load_text( + nodes=nodes, edges=edges, sites=sites, mutations=mutations, strict=False + ) self.verify(ts) def validate_trees(self, n): for seed in range(1, 10): - ts = msprime.simulate(n, random_seed=seed, recombination_rate=1) + ts = msprime.simulate( + n, random_seed=seed, recombination_rate=1, mutation_rate=1 + ) self.verify(ts) def test_sample_5(self): @@ -249,6 +286,7 @@ def validate_nonbinary_trees(self, n): random_seed=seed, demographic_events=demographic_events, recombination_rate=1, + mutation_rate=5, ) # Check if this is really nonbinary found = False @@ -265,13 +303,3 @@ def test_non_binary_sample_10(self): def test_non_binary_sample_20(self): self.validate_nonbinary_trees(20) - - def test_permit_internal_samples(self): - tables = tskit.TableCollection(1.0) - tables.nodes.add_row(flags=1) - tables.nodes.add_row(flags=1) - tables.nodes.add_row(flags=1, time=1) - tables.edges.add_row(0, 1, 2, 0) - tables.edges.add_row(0, 1, 2, 1) - ts = tables.tree_sequence() - self.verify(ts) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index fe6d7350b8..20edc31807 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5552,7 +5552,7 @@ def genetic_relatedness( windows=windows, mode=mode, span_normalise=span_normalise, - polarised=True, + polarised=False, ) def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): From 5e657b2768ca19b659719fa3431641d1270b5fe8 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Tue, 3 Nov 2020 14:11:31 +0000 Subject: [PATCH 6/9] Adjust tests and change default arguments Add hack to divide by 2 - polarised in C implementation --- c/tskit/trees.c | 10 --- python/tests/test_covariance.py | 139 +++++++++----------------------- python/tskit/trees.py | 52 ++++++------ 3 files changed, 61 insertions(+), 140 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index c30e0cb16c..19fce24487 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2760,27 +2760,17 @@ relatedness_summary_func(size_t state_dim, const double *state, const double *x = state; tsk_id_t i, j; size_t k; - int c = 0; double sumx = 0; double meanx; - double num = 0; for (k = 0; k < state_dim; k++) { sumx += x[k]; } - for (k = 0; k < state_dim; k++) { - num += args.sample_set_sizes[k]; - } - - if (num != sumx) { - c = 1; - } meanx = sumx / (double) state_dim; for (k = 0; k < result_dim; k++) { i = args.set_indexes[2 * k]; j = args.set_indexes[2 * k + 1]; - // result[k] = x[i] * x[j] * c; result[k] = (x[i] - meanx) * (x[j] - meanx); } return 0; diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py index cb1a1da124..11891cc29e 100644 --- a/python/tests/test_covariance.py +++ b/python/tests/test_covariance.py @@ -29,83 +29,22 @@ import msprime import numpy as np -import pytest -import tests.tsutil as tsutil import tskit -def check_cov_tree_inputs(tree): - if not len(tree.roots) == 1: - raise ValueError("Trees must have one root") - for u in tree.nodes(): - if tree.num_children(u) == 1: - raise ValueError("Unary nodes are not supported") - - -def naive_tree_covariance(tree): - """ - Returns the (branch) covariance matrix for the sample nodes in a tree. The - covariance between a pair of nodes is the distance from the root to their - most recent common ancestor. - """ - samples = tree.tree_sequence.samples() - check_cov_tree_inputs(tree) - n = samples.shape[0] - cov = np.zeros((n, n)) - for n1, n2 in itertools.combinations_with_replacement(range(n), 2): - mrca = tree.mrca(samples[n1], samples[n2]) - cov[n1, n2] = tree.time(tree.root) - tree.time(mrca) - cov[n2, n1] = cov[n1, n2] - return cov - - -def naive_ts_covariance(ts): - """ - Returns the (branch) covariance matrix for the sample nodes in a tree - sequence. The covariance between a pair of nodes is the weighted sum of the - tree covariance, with the weights given by the spans of the trees. - """ - samples = ts.samples() - n = samples.shape[0] - cov = np.zeros((n, n)) - for tree in ts.trees(): - cov += naive_tree_covariance(tree) * tree.span - return cov / ts.sequence_length - - -def naive_genotype_covariance(ts): +def naive_genotype_covariance(ts, proportion=False): G = ts.genotype_matrix() - # p = G.shape[0] + denominator = ts.sequence_length + if proportion: + all_samples = ts.samples() + num = ts.segregating_sites(all_samples) + denominator = denominator * num G = G.T - np.mean(G, axis=1) - return G @ G.T # / p - - -def genetic_relatedness(ts, mode="site", polarised=True): - # NOTE: I'm outputting a matrix here just for convenience; the proposal - # is that the tskit method *not* output a matrix, and use the indices argument - sample_sets = [[u] for u in ts.samples()] - # sample_sets = [[0], [1]] - n = len(sample_sets) - num_samples = sum(map(len, sample_sets)) - - def f(x): - # x[i] gives the number of descendants in sample set i below the branch - return np.array( - [x[i] * x[j] * (sum(x) != num_samples) for i in range(n) for j in range(n)] - ) + return G @ G.T / denominator - return ts.sample_count_stat( - sample_sets, - f, - output_dim=n * n, - mode=mode, - span_normalise=True, - polarised=polarised, - ).reshape((n, n)) - -def genotype_relatedness(ts, polarised=False): +def genotype_relatedness(ts, polarised=False, proportion=False): n = ts.num_samples sample_sets = [[u] for u in ts.samples()] @@ -118,20 +57,25 @@ def f(x): ] ) + denominator = 2 - polarised + if proportion: + all_samples = list({u for s in sample_sets for u in s}) + num = ts.segregating_sites(all_samples) + denominator = denominator * num return ( ts.sample_count_stat( sample_sets, f, output_dim=n * n, mode="site", - span_normalise=False, + span_normalise=True, polarised=polarised, ).reshape((n, n)) - / 2 + / denominator ) -def c_genotype_relatedness(ts, sample_sets, indexes): +def c_genotype_relatedness(ts, sample_sets, indexes, polarised=False, proportion=False): m = len(indexes) state_dim = len(sample_sets) @@ -144,17 +88,25 @@ def f(x): for k in range(m): i = indexes[k][0] j = indexes[k][1] - result[k] = (x[i] - meanx) * (x[j] - meanx) / 2 + result[k] = (x[i] - meanx) * (x[j] - meanx) return result - return ts.sample_count_stat( - sample_sets, - f, - output_dim=m, - mode="site", - span_normalise=False, - polarised=False, - strict=False, + denominator = 2 - polarised + if proportion: + all_samples = list({u for s in sample_sets for u in s}) + num = ts.segregating_sites(all_samples) + denominator = denominator * num + return ( + ts.sample_count_stat( + sample_sets, + f, + output_dim=m, + mode="site", + span_normalise=True, + polarised=False, + strict=False, + ) + / denominator ) @@ -164,9 +116,7 @@ class TestCovariance(unittest.TestCase): """ def verify(self, ts): - # cov1 = naive_ts_covariance(ts) cov1 = naive_genotype_covariance(ts) - # cov2 = genetic_relatedness(ts) cov2 = genotype_relatedness(ts) sample_sets = [[u] for u in ts.samples()] n = len(sample_sets) @@ -176,28 +126,15 @@ def verify(self, ts): cov3 = np.zeros((n, n)) cov4 = np.zeros((n, n)) i_upper = np.triu_indices(n) - cov3[i_upper] = ( - ts.genetic_relatedness( - sample_sets, indexes, mode="site", span_normalise=False - ) - / 2 - ) # NOTE: divided by 2 to reflect unpolarised + cov3[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes) cov3 = cov3 + cov3.T - np.diag(cov3.diagonal()) - cov4[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes) + cov4[i_upper] = ts.genetic_relatedness( + sample_sets, indexes, mode="site", span_normalise=True + ) cov4 = cov4 + cov4.T - np.diag(cov4.diagonal()) - # assert np.allclose(cov2, cov3) assert np.allclose(cov1, cov2) - assert np.allclose(cov1, cov4) assert np.allclose(cov1, cov3) - - def verify_errors(self, ts): - with pytest.raises(ValueError): - naive_ts_covariance(ts) - - def test_errors_multiroot_tree(self): - ts = msprime.simulate(15, random_seed=10, mutation_rate=1) - ts = tsutil.decapitate(ts, ts.num_edges // 2) - self.verify_errors(ts) + assert np.allclose(cov1, cov4) def test_single_coalescent_tree(self): ts = msprime.simulate(10, random_seed=1, length=10, mutation_rate=1) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 20edc31807..0fb80ec4c1 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5499,7 +5499,14 @@ def divergence( # return A def genetic_relatedness( - self, sample_sets, indexes=None, windows=None, mode="site", span_normalise=True + self, + sample_sets, + indexes=None, + windows=None, + mode="site", + span_normalise=True, + polarised=False, + proportion=True, ): """ Computes genetic relatedness between (and within) pairs of @@ -5512,27 +5519,9 @@ def genetic_relatedness( :ref:`windows `, :ref:`mode `, :ref:`span normalise `, + :ref:`polarised `, and :ref:`return value `. - What is computed depends on ``mode``: - - "site" - Mean pairwise genetic divergence: the average across distinct, - randomly chosen pairs of chromosomes (one from each sample set), of - the density of sites at which the two carry different alleles, per - unit of chromosome length. - - "branch" - Mean distance in the tree: the average across distinct, randomly - chosen pairs of chromsomes (one from each sample set) and locations - in the window, of the mean distance in the tree between the two - samples (in units of time). - - "node" - For each node, the proportion of genome on which the node is an ancestor to - only one of a random pair (one from each sample set), averaged over - choices of pair. - :param list sample_sets: A list of lists of Node IDs, specifying the groups of nodes to compute the statistic with. :param list indexes: A list of 2-tuples, or None. @@ -5542,17 +5531,22 @@ def genetic_relatedness( (defaults to "site"). :param bool span_normalise: Whether to divide the result by the span of the window (defaults to True). + :param bool proportion: Whether to divide the result by the number of + segregating sites (defaults to True). :return: A ndarray with shape equal to (num windows, num statistics). """ - return self.__k_way_sample_set_stat( - self._ll_tree_sequence.relatedness, - 2, - sample_sets, - indexes=indexes, - windows=windows, - mode=mode, - span_normalise=span_normalise, - polarised=False, + return ( + self.__k_way_sample_set_stat( + self._ll_tree_sequence.relatedness, + 2, + sample_sets, + indexes=indexes, + windows=windows, + mode=mode, + span_normalise=span_normalise, + polarised=polarised, + ) + / (2 - polarised) ) def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): From 9c49e1c2f68ed91c828cd2247061fd52bf0c0822 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Tue, 3 Nov 2020 18:11:25 +0000 Subject: [PATCH 7/9] Remove polarised in divisor --- python/tskit/trees.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 0fb80ec4c1..741597ff35 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -5546,7 +5546,7 @@ def genetic_relatedness( span_normalise=span_normalise, polarised=polarised, ) - / (2 - polarised) + / 2 ) def trait_covariance(self, W, windows=None, mode="site", span_normalise=True): From 72f901c279a3e70d68d4ce92c7405e8ef8c88bf4 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Tue, 3 Nov 2020 18:58:52 +0000 Subject: [PATCH 8/9] Lint c files --- c/tskit/trees.c | 12 ++++++------ c/tskit/trees.h | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 19fce24487..4a130c0d47 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -2753,8 +2753,8 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, } static int -relatedness_summary_func(size_t state_dim, const double *state, - size_t result_dim, double *result, void *params) +relatedness_summary_func(size_t state_dim, const double *state, size_t result_dim, + double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; const double *x = state; @@ -2778,9 +2778,9 @@ relatedness_summary_func(size_t state_dim, const double *state, int tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, - const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, - tsk_flags_t options) + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, + const double *windows, double *result, tsk_flags_t options) { int ret = 0; ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples); @@ -2795,7 +2795,7 @@ tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, } static int -Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, +Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, double *result, void *params) { sample_count_stat_params_t args = *(sample_count_stat_params_t *) params; diff --git a/c/tskit/trees.h b/c/tskit/trees.h index 779eeebbd5..762d8afcda 100644 --- a/c/tskit/trees.h +++ b/c/tskit/trees.h @@ -356,9 +356,9 @@ int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, tsk_flags_t options); int tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, - const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples, - const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result, - tsk_flags_t options); + const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, + tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows, + const double *windows, double *result, tsk_flags_t options); /* Three way sample set stats */ int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets, From f250616d10c3a08f8430fbf4d2d8d526bac65c03 Mon Sep 17 00:00:00 2001 From: brieuc-mac Date: Tue, 3 Nov 2020 20:28:47 +0000 Subject: [PATCH 9/9] Add c test --- c/tests/test_stats.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c index 89138efcea..b6cd6d770c 100644 --- a/c/tests/test_stats.c +++ b/c/tests/test_stats.c @@ -1333,6 +1333,25 @@ test_paper_ex_divergence(void) tsk_treeseq_free(&ts); } +static void +test_paper_ex_relatedness(void) +{ + tsk_treeseq_t ts; + tsk_id_t samples[] = { 0, 1, 2, 3 }; + tsk_size_t sample_set_sizes[] = { 2, 2 }; + tsk_id_t set_indexes[] = { 0, 0 }; + double result; + int ret; + + tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites, + paper_ex_mutations, paper_ex_individuals, NULL, 0); + + ret = tsk_treeseq_relatedness(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0, + NULL, &result, TSK_STAT_SITE); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts); +} + static void test_paper_ex_relatedness_errors(void) { @@ -1691,6 +1710,7 @@ main(int argc, char **argv) { "test_paper_ex_divergence_errors", test_paper_ex_divergence_errors }, { "test_paper_ex_divergence", test_paper_ex_divergence }, { "test_paper_ex_relatedness_errors", test_paper_ex_relatedness_errors }, + { "test_paper_ex_relatedness", test_paper_ex_relatedness }, { "test_paper_ex_Y2_errors", test_paper_ex_Y2_errors }, { "test_paper_ex_Y2", test_paper_ex_Y2 }, { "test_paper_ex_f2_errors", test_paper_ex_f2_errors },