Adjust covariance tests and change default arguments (add `polarised` and `proportion` to `genetic_relatedness`)

brieuc-mac · brieuc-mac · commit 9e47224e1e05 · 2020-11-03T14:12:08.000Z
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
@@ -2760,27 +2760,17 @@ relatedness_summary_func(size_t state_dim, const double *state,
     const double *x = state;
     tsk_id_t i, j;
     size_t k;
-    int c = 0;
     double sumx = 0;
     double meanx;
-    double num = 0;
 
     for (k = 0; k < state_dim; k++) {
         sumx += x[k];
     }
 
-    for (k = 0; k < state_dim; k++) {
-        num += args.sample_set_sizes[k];
-    }
-
-    if (num != sumx) {
-        c = 1;
-    }
     meanx = sumx / (double) state_dim;
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
-        // result[k] = x[i] * x[j] * c;
         result[k] = (x[i] - meanx) * (x[j] - meanx);
     }
     return 0;
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
@@ -29,83 +29,22 @@
 
 import msprime
 import numpy as np
-import pytest
 
-import tests.tsutil as tsutil
 import tskit
 
 
-def check_cov_tree_inputs(tree):
-    if not len(tree.roots) == 1:
-        raise ValueError("Trees must have one root")
-    for u in tree.nodes():
-        if tree.num_children(u) == 1:
-            raise ValueError("Unary nodes are not supported")
-
-
-def naive_tree_covariance(tree):
-    """
-    Returns the (branch) covariance matrix for the sample nodes in a tree. The
-    covariance between a pair of nodes is the distance from the root to their
-    most recent common ancestor.
-    """
-    samples = tree.tree_sequence.samples()
-    check_cov_tree_inputs(tree)
-    n = samples.shape[0]
-    cov = np.zeros((n, n))
-    for n1, n2 in itertools.combinations_with_replacement(range(n), 2):
-        mrca = tree.mrca(samples[n1], samples[n2])
-        cov[n1, n2] = tree.time(tree.root) - tree.time(mrca)
-        cov[n2, n1] = cov[n1, n2]
-    return cov
-
-
-def naive_ts_covariance(ts):
-    """
-    Returns the (branch) covariance matrix for the sample nodes in a tree
-    sequence. The covariance between a pair of nodes is the weighted sum of the
-    tree covariance, with the weights given by the spans of the trees.
-    """
-    samples = ts.samples()
-    n = samples.shape[0]
-    cov = np.zeros((n, n))
-    for tree in ts.trees():
-        cov += naive_tree_covariance(tree) * tree.span
-    return cov / ts.sequence_length
-
-
-def naive_genotype_covariance(ts):
+def naive_genotype_covariance(ts, proportion=False):
     G = ts.genotype_matrix()
-    # p = G.shape[0]
+    denominator = ts.sequence_length
+    if proportion:
+        all_samples = ts.samples()
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
     G = G.T - np.mean(G, axis=1)
-    return G @ G.T  # / p
-
-
-def genetic_relatedness(ts, mode="site", polarised=True):
-    # NOTE: I'm outputting a matrix here just for convenience; the proposal
-    # is that the tskit method *not* output a matrix, and use the indices argument
-    sample_sets = [[u] for u in ts.samples()]
-    # sample_sets = [[0], [1]]
-    n = len(sample_sets)
-    num_samples = sum(map(len, sample_sets))
-
-    def f(x):
-        # x[i] gives the number of descendants in sample set i below the branch
-        return np.array(
-            [x[i] * x[j] * (sum(x) != num_samples) for i in range(n) for j in range(n)]
-        )
+    return G @ G.T / denominator
 
-    return ts.sample_count_stat(
-        sample_sets,
-        f,
-        output_dim=n * n,
-        mode=mode,
-        span_normalise=True,
-        polarised=polarised,
-    ).reshape((n, n))
 
-
-def genotype_relatedness(ts, polarised=False):
+def genotype_relatedness(ts, polarised=False, proportion=False):
     n = ts.num_samples
     sample_sets = [[u] for u in ts.samples()]
 
@@ -118,20 +57,25 @@ def f(x):
             ]
         )
 
+    denominator = 2 - polarised
+    if proportion:
+        all_samples = list({u for s in sample_sets for u in s})
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
     return (
         ts.sample_count_stat(
             sample_sets,
             f,
             output_dim=n * n,
             mode="site",
-            span_normalise=False,
+            span_normalise=True,
             polarised=polarised,
         ).reshape((n, n))
-        / 2
+        / denominator
     )
 
 
-def c_genotype_relatedness(ts, sample_sets, indexes):
+def c_genotype_relatedness(ts, sample_sets, indexes, polarised=False, proportion=False):
     m = len(indexes)
     state_dim = len(sample_sets)
 
@@ -144,17 +88,25 @@ def f(x):
         for k in range(m):
             i = indexes[k][0]
             j = indexes[k][1]
-            result[k] = (x[i] - meanx) * (x[j] - meanx) / 2
+            result[k] = (x[i] - meanx) * (x[j] - meanx)
         return result
 
-    return ts.sample_count_stat(
-        sample_sets,
-        f,
-        output_dim=m,
-        mode="site",
-        span_normalise=False,
-        polarised=False,
-        strict=False,
+    denominator = 2 - polarised
+    if proportion:
+        all_samples = list({u for s in sample_sets for u in s})
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
+    return (
+        ts.sample_count_stat(
+            sample_sets,
+            f,
+            output_dim=m,
+            mode="site",
+            span_normalise=True,
+            polarised=False,
+            strict=False,
+        )
+        / denominator
     )
 
 
@@ -164,9 +116,7 @@ class TestCovariance(unittest.TestCase):
     """
 
     def verify(self, ts):
-        # cov1 = naive_ts_covariance(ts)
         cov1 = naive_genotype_covariance(ts)
-        # cov2 = genetic_relatedness(ts)
         cov2 = genotype_relatedness(ts)
         sample_sets = [[u] for u in ts.samples()]
         n = len(sample_sets)
@@ -176,28 +126,15 @@ def verify(self, ts):
         cov3 = np.zeros((n, n))
         cov4 = np.zeros((n, n))
         i_upper = np.triu_indices(n)
-        cov3[i_upper] = (
-            ts.genetic_relatedness(
-                sample_sets, indexes, mode="site", span_normalise=False
-            )
-            / 2
-        )  # NOTE: divided by 2 to reflect unpolarised
+        cov3[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes)
         cov3 = cov3 + cov3.T - np.diag(cov3.diagonal())
-        cov4[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes)
+        cov4[i_upper] = ts.genetic_relatedness(
+            sample_sets, indexes, mode="site", span_normalise=True
+        )
         cov4 = cov4 + cov4.T - np.diag(cov4.diagonal())
-        # assert np.allclose(cov2, cov3)
         assert np.allclose(cov1, cov2)
-        assert np.allclose(cov1, cov4)
         assert np.allclose(cov1, cov3)
-
-    def verify_errors(self, ts):
-        with pytest.raises(ValueError):
-            naive_ts_covariance(ts)
-
-    def test_errors_multiroot_tree(self):
-        ts = msprime.simulate(15, random_seed=10, mutation_rate=1)
-        ts = tsutil.decapitate(ts, ts.num_edges // 2)
-        self.verify_errors(ts)
+        assert np.allclose(cov1, cov4)
 
     def test_single_coalescent_tree(self):
         ts = msprime.simulate(10, random_seed=1, length=10, mutation_rate=1)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
@@ -5499,7 +5499,14 @@ def divergence(
     #     return A
 
     def genetic_relatedness(
-        self, sample_sets, indexes=None, windows=None, mode="site", span_normalise=True
+        self,
+        sample_sets,
+        indexes=None,
+        windows=None,
+        mode="site",
+        span_normalise=True,
+        polarised=False,
+        proportion=True,
     ):
         """
         Computes genetic relatedness between (and within) pairs of
@@ -5512,27 +5519,9 @@ def genetic_relatedness(
         :ref:`windows <sec_stats_windows>`,
         :ref:`mode <sec_stats_mode>`,
         :ref:`span normalise <sec_stats_span_normalise>`,
+        :ref:`polarised <sec_stats_polarisation>`,
         and :ref:`return value <sec_stats_output_format>`.
 
-        What is computed depends on ``mode``:
-
-        "site"
-            Mean pairwise genetic divergence: the average across distinct,
-            randomly chosen pairs of chromosomes (one from each sample set), of
-            the density of sites at which the two carry different alleles, per
-            unit of chromosome length.
-
-        "branch"
-            Mean distance in the tree: the average across distinct, randomly
-            chosen pairs of chromsomes (one from each sample set) and locations
-            in the window, of the mean distance in the tree between the two
-            samples (in units of time).
-
-        "node"
-            For each node, the proportion of genome on which the node is an ancestor to
-            only one of a random pair (one from each sample set), averaged over
-            choices of pair.
-
         :param list sample_sets: A list of lists of Node IDs, specifying the
             groups of nodes to compute the statistic with.
         :param list indexes: A list of 2-tuples, or None.
@@ -5542,6 +5531,8 @@ def genetic_relatedness(
             (defaults to "site").
         :param bool span_normalise: Whether to divide the result by the span of the
             window (defaults to True).
+        :param bool proportion: Whether to divide the result by the number of
+            segregating sites (defaults to True).
         :return: A ndarray with shape equal to (num windows, num statistics).
         """
         return self.__k_way_sample_set_stat(
@@ -5552,7 +5543,7 @@ def genetic_relatedness(
             windows=windows,
             mode=mode,
             span_normalise=span_normalise,
-            polarised=False,
+            polarised=polarised,
         )
 
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):