From 1eb63e00d986983540e775afc74608b39bd602b7 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Thu, 8 Oct 2020 14:00:47 +0100
Subject: [PATCH 1/9] Add python function for covariance matrix

---
 python/tests/test_covariance.py | 288 ++++++++++++++++++++++++++++++++
 1 file changed, 288 insertions(+)
 create mode 100644 python/tests/test_covariance.py

diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
new file mode 100644
index 0000000000..f62eb8aa77
--- /dev/null
+++ b/python/tests/test_covariance.py
@@ -0,0 +1,288 @@
+# MIT License
+#
+# Copyright (c) 2018-2020 Tskit Developers
+# Copyright (c) 2016-2017 University of Oxford
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Test cases for covariance computation.
+"""
+import io
+import itertools
+import unittest
+
+import msprime
+import numpy as np
+import pytest
+
+import tests.tsutil as tsutil
+import tskit
+
+
+def check_cov_tree_inputs(tree):
+    if not len(tree.roots) == 1:
+        raise ValueError("Trees must have one root")
+
+    for u in tree.nodes():
+        if tree.num_children(u) == 1:
+            raise ValueError("Unary nodes are not supported")
+
+
+def naive_tree_covariance(tree):
+    """
+    Returns the (branch) covariance matrix for the sample nodes in a tree. The
+    covariance between a pair of nodes is the distance from the root to their
+    most recent common ancestor.
+    """
+    samples = tree.tree_sequence.samples()
+    check_cov_tree_inputs(tree)
+
+    n = samples.shape[0]
+    cov = np.zeros((n, n))
+    for n1, n2 in itertools.combinations_with_replacement(range(n), 2):
+        mrca = tree.mrca(samples[n1], samples[n2])
+        cov[n1, n2] = tree.time(tree.root) - tree.time(mrca)
+        cov[n2, n1] = cov[n1, n2]
+
+    return cov
+
+
+def naive_ts_covariance(ts):
+    """
+    Returns the (branch) covariance matrix for the sample nodes in a tree
+    sequence. The covariance between a pair of nodes is the weighted sum of the
+    tree covariance, with the weights given by the spans of the trees.
+    """
+    samples = ts.samples()
+
+    n = samples.shape[0]
+    cov = np.zeros((n, n))
+    for tree in ts.trees():
+        cov += naive_tree_covariance(tree) * tree.span
+
+    return cov / ts.sequence_length
+
+
+def ts_covariance_incremental(ts):
+    for tree in ts.trees():
+        check_cov_tree_inputs(tree)
+
+    n = ts.num_samples
+
+    this_cov = np.zeros((n, n))
+    total_cov = np.zeros((n, n))
+    last_update = np.zeros((n, n))
+
+    for tree, edge_diff in zip(ts.trees(sample_lists=True), ts.edge_diffs()):
+        (t_left, t_right), edges_out, edges_in = edge_diff
+
+        for e in reversed(edges_out):
+            u = e.child
+            if tree.parent(u) == tskit.NULL:
+                for leaf in tree.leaves(u):
+                    update_cov_pairs_with_leaf(
+                        tree, total_cov, this_cov, last_update, leaf, t_left
+                    )
+
+        for e in reversed(edges_in):
+            u = e.child
+            for leaf in tree.leaves(u):
+                update_cov_pairs_with_leaf(
+                    tree, total_cov, this_cov, last_update, leaf, t_left
+                )
+
+    total_cov += this_cov * (t_right - last_update)
+    for (n1, n2) in itertools.combinations(range(n), 2):
+        total_cov[n2, n1] = total_cov[n1, n2]
+    return total_cov / ts.sequence_length
+
+
+def update_cov_pairs_with_leaf(tree, total_cov, this_cov, last_update, leaf, t_left):
+    """
+    Perform an upward traversal from `leaf` to the root, updating the covariance
+    matrix elements for pairs of `leaf` with every other leaf in the tree.
+    """
+    root_time = tree.time(tree.root)
+    p = tree.parent(leaf)
+    c = leaf
+
+    if tree.is_sample(c):
+        total_cov[c, c] += this_cov[c, c] * (t_left - last_update[c, c])
+        last_update[c, c] = t_left
+        this_cov[c, c] = root_time - tree.time(c)
+    while p != -1:
+        time = root_time - tree.time(p)
+        for sibling in tree.children(p):
+            if sibling != c:
+                update_cov_all_pairs(
+                    tree, total_cov, this_cov, last_update, leaf, sibling, time, t_left
+                )
+        c, p = p, tree.parent(p)
+
+
+def update_cov_all_pairs(tree, total_cov, this_cov, last_update, c1, c2, time, t_left):
+    s1_ind = tree.left_sample(c1)
+    while True:
+        s2_ind = tree.left_sample(c2)
+        while True:
+            (n1, n2) = (s1_ind, s2_ind)
+            if n1 > n2:
+                (n1, n2) = (n2, n1)
+            total_cov[n1, n2] += this_cov[n1, n2] * (t_left - last_update[n1, n2])
+            last_update[n1, n2] = t_left
+            this_cov[n1, n2] = time
+            if s2_ind == tree.right_sample(c2):
+                break
+            s2_ind = tree.next_sample(s2_ind)
+        if s1_ind == tree.right_sample(c1):
+            break
+        s1_ind = tree.next_sample(s1_ind)
+
+
+class TestCovariance(unittest.TestCase):
+    """
+    Tests on covariance matrix computation
+    """
+
+    def verify(self, ts):
+        cov1 = naive_ts_covariance(ts)
+        cov2 = ts_covariance_incremental(ts)
+        assert np.allclose(cov1, cov2)
+
+    def verify_errors(self, ts):
+        with pytest.raises(ValueError):
+            naive_ts_covariance(ts)
+        with pytest.raises(ValueError):
+            ts_covariance_incremental(ts)
+
+    def test_errors_unary_nodes(self):
+        tables = tskit.TableCollection(sequence_length=2.0)
+
+        sv = [True, False, False]
+        tv = [0.0, 1.0, 2.0]
+        for is_sample, t in zip(sv, tv):
+            flags = tskit.NODE_IS_SAMPLE if is_sample else 0
+            tables.nodes.add_row(flags=flags, time=t)
+
+        lv = [0.0, 0.0, 0.0]
+        rv = [1.0, 1.0, 1.0]
+        pv = [1, 2]
+        cv = [0, 1]
+        for left, right, p, c in zip(lv, rv, pv, cv):
+            tables.edges.add_row(left=left, right=right, parent=p, child=c)
+
+        ts = tables.tree_sequence()
+        self.verify_errors(ts)
+
+    def test_errors_multiroot_tree(self):
+        ts = msprime.simulate(15, random_seed=10)
+        ts = tsutil.decapitate(ts, ts.num_edges // 2)
+        self.verify_errors(ts)
+
+    def test_single_coalescent_tree(self):
+        ts = msprime.simulate(10, random_seed=1, length=10)
+        self.verify(ts)
+
+    def test_coalescent_trees(self):
+        ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2)
+        assert ts.num_trees > 2
+        self.verify(ts)
+
+    def test_internal_samples(self):
+        nodes = io.StringIO(
+            """\
+        id      is_sample   time
+        0       0           0
+        1       1           0.1
+        2       1           0.1
+        3       1           0.2
+        4       0           0.4
+        5       1           0.5
+        6       0           0.7
+        7       0           1.0
+        8       0           0.8
+        """
+        )
+        edges = io.StringIO(
+            """\
+        left    right   parent  child
+        0.0     0.2     4       2,3
+        0.2     0.8     4       0,2
+        0.8     1.0     4       2,3
+        0.0     1.0     5       1,4
+        0.8     1.0     6       0,5
+        0.2     0.8     8       3,5
+        0.0     0.2     7       0,5
+        """
+        )
+        ts = tskit.load_text(nodes=nodes, edges=edges, strict=False)
+        self.verify(ts)
+
+    def validate_trees(self, n):
+        for seed in range(1, 10):
+            ts = msprime.simulate(n, random_seed=seed, recombination_rate=1)
+            self.verify(ts)
+
+    def test_sample_5(self):
+        self.validate_trees(5)
+
+    def test_sample_10(self):
+        self.validate_trees(10)
+
+    def test_sample_20(self):
+        self.validate_trees(20)
+
+    def validate_nonbinary_trees(self, n):
+        demographic_events = [
+            msprime.SimpleBottleneck(0.02, 0, proportion=0.25),
+            msprime.SimpleBottleneck(0.2, 0, proportion=1),
+        ]
+
+        for seed in range(1, 10):
+            ts = msprime.simulate(
+                n,
+                random_seed=seed,
+                demographic_events=demographic_events,
+                recombination_rate=1,
+            )
+            # Check if this is really nonbinary
+            found = False
+            for edgeset in ts.edgesets():
+                if len(edgeset.children) > 2:
+                    found = True
+                    break
+            assert found
+
+            self.verify(ts)
+
+    def test_non_binary_sample_10(self):
+        self.validate_nonbinary_trees(10)
+
+    def test_non_binary_sample_20(self):
+        self.validate_nonbinary_trees(20)
+
+    def test_permit_internal_samples(self):
+        tables = tskit.TableCollection(1.0)
+        tables.nodes.add_row(flags=1)
+        tables.nodes.add_row(flags=1)
+        tables.nodes.add_row(flags=1, time=1)
+        tables.edges.add_row(0, 1, 2, 0)
+        tables.edges.add_row(0, 1, 2, 1)
+        ts = tables.tree_sequence()
+        self.verify(ts)

From 713c4725b16ea9f7897122633f05300a77187797 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Mon, 26 Oct 2020 18:59:28 +0000
Subject: [PATCH 2/9] First go at C implementation of genetic_relatedness

---
 c/tskit/trees.c                 |  51 +++++++++++-
 c/tskit/trees.h                 |   6 +-
 python/_tskitmodule.c           |  10 +++
 python/tests/test_covariance.py | 143 +++++++++++---------------------
 python/tskit/trees.py           |  59 +++++++++++++
 5 files changed, 171 insertions(+), 98 deletions(-)

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index 11d2890f97..c2432b833b 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2752,8 +2752,57 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
     return ret;
 }
 
+relatedness_summary_func(size_t state_dim, const double *state, 
+    size_t result_dim, double *result, void *params)
+{
+    sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
+    const double *x = state;
+    tsk_id_t i, j;
+    size_t k;
+    double c = 0;
+    double sumx = 0;
+    double num = 0;
+
+    for (k = 0; k < state_dim; k++) {
+        sumx += x[k];
+    }
+
+    for (k = 0; k < result_dim; k++) {
+        num += args.sample_set_sizes[k];
+    } 
+
+    if (num != sumx) {
+        c = 1;
+    }
+
+    for (k = 0; k < result_dim; k++) {
+        i = args.set_indexes[2 * k];
+        j = args.set_indexes[2 * k + 1];
+        result[k] = x[i] * x[j] * c;
+    }
+    return 0;
+}
+
+int
+tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
+    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples,
+    const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result,
+    tsk_flags_t options)
+{
+    int ret = 0;
+    ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples);
+    if (ret != 0) {
+        goto out;
+    }
+    ret = tsk_treeseq_sample_count_stat(self, num_sample_sets, sample_set_sizes,
+        sample_sets, num_index_tuples, index_tuples, relatedness_summary_func,
+        num_windows, windows, result, options);
+out:
+    return ret;
+}
+
 static int
-Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim,
+Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, 
     double *result, void *params)
 {
     sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
diff --git a/c/tskit/trees.h b/c/tskit/trees.h
index 8eb8427478..779eeebbd5 100644
--- a/c/tskit/trees.h
+++ b/c/tskit/trees.h
@@ -338,8 +338,6 @@ int tsk_treeseq_allele_frequency_spectrum(const tsk_treeseq_t *self,
     const tsk_id_t *sample_sets, tsk_size_t num_windows, const double *windows,
     double *result, tsk_flags_t options);
 
-/* Two way sample set stats */
-
 typedef int general_sample_stat_method(const tsk_treeseq_t *self,
     tsk_size_t num_sample_sets, const tsk_size_t *sample_set_sizes,
     const tsk_id_t *sample_sets, tsk_size_t num_indexes, const tsk_id_t *indexes,
@@ -357,6 +355,10 @@ int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
     const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
     tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows,
     const double *windows, double *result, tsk_flags_t options);
+int tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
+    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples,
+    const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result,
+    tsk_flags_t options);
 
 /* Three way sample set stats */
 int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c
index 12754d7ea5..f3e21f8acf 100644
--- a/python/_tskitmodule.c
+++ b/python/_tskitmodule.c
@@ -7028,6 +7028,12 @@ TreeSequence_divergence(TreeSequence *self, PyObject *args, PyObject *kwds)
     return TreeSequence_k_way_stat_method(self, args, kwds, 2, tsk_treeseq_divergence);
 }
 
+static PyObject *
+TreeSequence_relatedness(TreeSequence *self, PyObject *args, PyObject *kwds)
+{
+    return TreeSequence_k_way_stat_method(self, args, kwds, 2, tsk_treeseq_relatedness);
+}
+
 static PyObject *
 TreeSequence_Y2(TreeSequence *self, PyObject *args, PyObject *kwds)
 {
@@ -7345,6 +7351,10 @@ static PyMethodDef TreeSequence_methods[] = {
         .ml_meth = (PyCFunction) TreeSequence_divergence,
         .ml_flags = METH_VARARGS | METH_KEYWORDS,
         .ml_doc = "Computes diveregence between sample sets." },
+    { .ml_name = "relatedness",
+        .ml_meth = (PyCFunction) TreeSequence_relatedness,
+        .ml_flags = METH_VARARGS | METH_KEYWORDS,
+        .ml_doc = "Computes genetic relatedness between sample sets." },    
     { .ml_name = "Y1",
         .ml_meth = (PyCFunction) TreeSequence_Y1,
         .ml_flags = METH_VARARGS | METH_KEYWORDS,
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
index f62eb8aa77..6b10a53d1f 100644
--- a/python/tests/test_covariance.py
+++ b/python/tests/test_covariance.py
@@ -38,7 +38,6 @@
 def check_cov_tree_inputs(tree):
     if not len(tree.roots) == 1:
         raise ValueError("Trees must have one root")
-
     for u in tree.nodes():
         if tree.num_children(u) == 1:
             raise ValueError("Unary nodes are not supported")
@@ -52,14 +51,12 @@ def naive_tree_covariance(tree):
     """
     samples = tree.tree_sequence.samples()
     check_cov_tree_inputs(tree)
-
     n = samples.shape[0]
     cov = np.zeros((n, n))
     for n1, n2 in itertools.combinations_with_replacement(range(n), 2):
         mrca = tree.mrca(samples[n1], samples[n2])
         cov[n1, n2] = tree.time(tree.root) - tree.time(mrca)
         cov[n2, n1] = cov[n1, n2]
-
     return cov
 
 
@@ -70,89 +67,34 @@ def naive_ts_covariance(ts):
     tree covariance, with the weights given by the spans of the trees.
     """
     samples = ts.samples()
-
     n = samples.shape[0]
     cov = np.zeros((n, n))
     for tree in ts.trees():
         cov += naive_tree_covariance(tree) * tree.span
-
     return cov / ts.sequence_length
 
 
-def ts_covariance_incremental(ts):
-    for tree in ts.trees():
-        check_cov_tree_inputs(tree)
-
-    n = ts.num_samples
-
-    this_cov = np.zeros((n, n))
-    total_cov = np.zeros((n, n))
-    last_update = np.zeros((n, n))
-
-    for tree, edge_diff in zip(ts.trees(sample_lists=True), ts.edge_diffs()):
-        (t_left, t_right), edges_out, edges_in = edge_diff
-
-        for e in reversed(edges_out):
-            u = e.child
-            if tree.parent(u) == tskit.NULL:
-                for leaf in tree.leaves(u):
-                    update_cov_pairs_with_leaf(
-                        tree, total_cov, this_cov, last_update, leaf, t_left
-                    )
-
-        for e in reversed(edges_in):
-            u = e.child
-            for leaf in tree.leaves(u):
-                update_cov_pairs_with_leaf(
-                    tree, total_cov, this_cov, last_update, leaf, t_left
-                )
-
-    total_cov += this_cov * (t_right - last_update)
-    for (n1, n2) in itertools.combinations(range(n), 2):
-        total_cov[n2, n1] = total_cov[n1, n2]
-    return total_cov / ts.sequence_length
+def genetic_relatedness(ts):
+    # NOTE: I'm outputting a matrix here just for convenience; the proposal
+    # is that the tskit method *not* output a matrix, and use the indices argument
+    sample_sets = [[u] for u in ts.samples()]
+    n = len(sample_sets)
+    num_samples = sum(map(len, sample_sets))
 
+    def f(x):
+        # x[i] gives the number of descendants in sample set i below the branch
+        return np.array(
+            [x[i] * x[j] * (sum(x) != num_samples) for i in range(n) for j in range(n)]
+        )
 
-def update_cov_pairs_with_leaf(tree, total_cov, this_cov, last_update, leaf, t_left):
-    """
-    Perform an upward traversal from `leaf` to the root, updating the covariance
-    matrix elements for pairs of `leaf` with every other leaf in the tree.
-    """
-    root_time = tree.time(tree.root)
-    p = tree.parent(leaf)
-    c = leaf
-
-    if tree.is_sample(c):
-        total_cov[c, c] += this_cov[c, c] * (t_left - last_update[c, c])
-        last_update[c, c] = t_left
-        this_cov[c, c] = root_time - tree.time(c)
-    while p != -1:
-        time = root_time - tree.time(p)
-        for sibling in tree.children(p):
-            if sibling != c:
-                update_cov_all_pairs(
-                    tree, total_cov, this_cov, last_update, leaf, sibling, time, t_left
-                )
-        c, p = p, tree.parent(p)
-
-
-def update_cov_all_pairs(tree, total_cov, this_cov, last_update, c1, c2, time, t_left):
-    s1_ind = tree.left_sample(c1)
-    while True:
-        s2_ind = tree.left_sample(c2)
-        while True:
-            (n1, n2) = (s1_ind, s2_ind)
-            if n1 > n2:
-                (n1, n2) = (n2, n1)
-            total_cov[n1, n2] += this_cov[n1, n2] * (t_left - last_update[n1, n2])
-            last_update[n1, n2] = t_left
-            this_cov[n1, n2] = time
-            if s2_ind == tree.right_sample(c2):
-                break
-            s2_ind = tree.next_sample(s2_ind)
-        if s1_ind == tree.right_sample(c1):
-            break
-        s1_ind = tree.next_sample(s1_ind)
+    return ts.sample_count_stat(
+        sample_sets,
+        f,
+        output_dim=n * n,
+        mode="branch",
+        span_normalise=True,
+        polarised=True,
+    ).reshape((n, n))
 
 
 class TestCovariance(unittest.TestCase):
@@ -162,33 +104,44 @@ class TestCovariance(unittest.TestCase):
 
     def verify(self, ts):
         cov1 = naive_ts_covariance(ts)
-        cov2 = ts_covariance_incremental(ts)
+        cov2 = genetic_relatedness(ts)
+        sample_sets = [[u] for u in ts.samples()]
+        n = len(sample_sets)
+        indexes = [
+            (n1, n2) for n1, n2 in itertools.combinations_with_replacement(range(n), 2)
+        ]
+        cov3 = np.zeros((n, n))
+        i_upper = np.triu_indices(n)
+        cov3[i_upper] = ts.genetic_relatedness(
+            sample_sets, indexes, mode="branch", span_normalise=True
+        )
+        cov3 = np.maximum(cov3, cov3.transpose())
+        # cov2 = ts_covariance_incremental(ts)
         assert np.allclose(cov1, cov2)
+        assert np.allclose(cov1, cov3)
 
     def verify_errors(self, ts):
         with pytest.raises(ValueError):
             naive_ts_covariance(ts)
-        with pytest.raises(ValueError):
-            ts_covariance_incremental(ts)
 
-    def test_errors_unary_nodes(self):
-        tables = tskit.TableCollection(sequence_length=2.0)
+    # def test_errors_unary_nodes(self):
+    #     tables = tskit.TableCollection(sequence_length=2.0)
 
-        sv = [True, False, False]
-        tv = [0.0, 1.0, 2.0]
-        for is_sample, t in zip(sv, tv):
-            flags = tskit.NODE_IS_SAMPLE if is_sample else 0
-            tables.nodes.add_row(flags=flags, time=t)
+    #     sv = [True, False, False]
+    #     tv = [0.0, 1.0, 2.0]
+    #     for is_sample, t in zip(sv, tv):
+    #         flags = tskit.NODE_IS_SAMPLE if is_sample else 0
+    #         tables.nodes.add_row(flags=flags, time=t)
 
-        lv = [0.0, 0.0, 0.0]
-        rv = [1.0, 1.0, 1.0]
-        pv = [1, 2]
-        cv = [0, 1]
-        for left, right, p, c in zip(lv, rv, pv, cv):
-            tables.edges.add_row(left=left, right=right, parent=p, child=c)
+    #     lv = [0.0, 0.0, 0.0]
+    #     rv = [1.0, 1.0, 1.0]
+    #     pv = [1, 2]
+    #     cv = [0, 1]
+    #     for left, right, p, c in zip(lv, rv, pv, cv):
+    #         tables.edges.add_row(left=left, right=right, parent=p, child=c)
 
-        ts = tables.tree_sequence()
-        self.verify_errors(ts)
+    #     ts = tables.tree_sequence()
+    #     self.verify_errors(ts)
 
     def test_errors_multiroot_tree(self):
         ts = msprime.simulate(15, random_seed=10)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 3e597148ce..e3a68696c6 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5496,6 +5496,65 @@ def divergence(
     #                 k += 1
     #     return A
 
+    def genetic_relatedness(
+        self, sample_sets, indexes=None, windows=None, mode="site", span_normalise=True
+    ):
+        """
+        Computes genetic relatedness between (and within) pairs of
+        sets of nodes from ``sample_sets``.
+        Operates on ``k = 2`` sample sets at a time; please see the
+        :ref:`multi-way statistics <sec_stats_sample_sets_multi_way>`
+        section for details on how the ``sample_sets`` and ``indexes`` arguments are
+        interpreted and how they interact with the dimensions of the output array.
+        See the :ref:`statistics interface <sec_stats_interface>` section for details on
+        :ref:`windows <sec_stats_windows>`,
+        :ref:`mode <sec_stats_mode>`,
+        :ref:`span normalise <sec_stats_span_normalise>`,
+        and :ref:`return value <sec_stats_output_format>`.
+
+        As a special case, an index ``(j, j)`` will compute the
+        :meth:`diversity <.TreeSequence.diversity>` of ``sample_set[j]``.
+
+        What is computed depends on ``mode``:
+
+        "site"
+            Mean pairwise genetic divergence: the average across distinct,
+            randomly chosen pairs of chromosomes (one from each sample set), of
+            the density of sites at which the two carry different alleles, per
+            unit of chromosome length.
+
+        "branch"
+            Mean distance in the tree: the average across distinct, randomly
+            chosen pairs of chromsomes (one from each sample set) and locations
+            in the window, of the mean distance in the tree between the two
+            samples (in units of time).
+
+        "node"
+            For each node, the proportion of genome on which the node is an ancestor to
+            only one of a random pair (one from each sample set), averaged over
+            choices of pair.
+
+        :param list sample_sets: A list of lists of Node IDs, specifying the
+            groups of nodes to compute the statistic with.
+        :param list indexes: A list of 2-tuples, or None.
+        :param list windows: An increasing list of breakpoints between the windows
+            to compute the statistic in.
+        :param str mode: A string giving the "type" of the statistic to be computed
+            (defaults to "site").
+        :param bool span_normalise: Whether to divide the result by the span of the
+            window (defaults to True).
+        :return: A ndarray with shape equal to (num windows, num statistics).
+        """
+        return self.__k_way_sample_set_stat(
+            self._ll_tree_sequence.relatedness,
+            2,
+            sample_sets,
+            indexes=indexes,
+            windows=windows,
+            mode=mode,
+            span_normalise=span_normalise,
+        )
+
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):
         """
         Computes the mean squared covariances between each of the columns of ``W``

From 9373782d6d1f60da1e6ac395c3bda478a6f3b07e Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Wed, 28 Oct 2020 10:58:02 +0000
Subject: [PATCH 3/9] Write c-like summary function in python

---
 c/tskit/trees.c                 |  8 +++++--
 python/tests/test_covariance.py | 37 ++++++++++++++++++++++++++++++++-
 2 files changed, 42 insertions(+), 3 deletions(-)

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index c2432b833b..4a6e3861f9 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2759,7 +2759,7 @@ relatedness_summary_func(size_t state_dim, const double *state,
     const double *x = state;
     tsk_id_t i, j;
     size_t k;
-    double c = 0;
+    int c = 0;
     double sumx = 0;
     double num = 0;
 
@@ -2767,7 +2767,7 @@ relatedness_summary_func(size_t state_dim, const double *state,
         sumx += x[k];
     }
 
-    for (k = 0; k < result_dim; k++) {
+    for (k = 0; k < state_dim; k++) {
         num += args.sample_set_sizes[k];
     } 
 
@@ -2775,6 +2775,10 @@ relatedness_summary_func(size_t state_dim, const double *state,
         c = 1;
     }
 
+    // printf("num = %0.7f\n", num);
+    // printf("sumx = %0.7f\n", sumx);
+    // printf("c = %d\n", c);
+
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
index 6b10a53d1f..5ed858d4dc 100644
--- a/python/tests/test_covariance.py
+++ b/python/tests/test_covariance.py
@@ -78,6 +78,7 @@ def genetic_relatedness(ts):
     # NOTE: I'm outputting a matrix here just for convenience; the proposal
     # is that the tskit method *not* output a matrix, and use the indices argument
     sample_sets = [[u] for u in ts.samples()]
+    # sample_sets = [[0], [1]]
     n = len(sample_sets)
     num_samples = sum(map(len, sample_sets))
 
@@ -97,6 +98,37 @@ def f(x):
     ).reshape((n, n))
 
 
+def c_genetic_relatedness(ts, sample_sets, indexes):
+    n = len(indexes)
+    state_dim = len(sample_sets)
+
+    def f(x):
+        # x[i] gives the number of descendants in sample set i below the branch
+        sumx = 0
+        num = 0
+        c = 0
+        for k in range(state_dim):
+            sumx += x[k]
+
+        for k in range(state_dim):
+            num += len(sample_sets[k])
+
+        if num != sumx:
+            c = 1
+
+        result = np.zeros(n)
+        for k in range(n):
+            i = indexes[k][0]
+            j = indexes[k][1]
+            result[k] = x[i] * x[j] * c
+
+        return result
+
+    return ts.sample_count_stat(
+        sample_sets, f, output_dim=n, mode="branch", span_normalise=True, polarised=True
+    )
+
+
 class TestCovariance(unittest.TestCase):
     """
     Tests on covariance matrix computation
@@ -111,13 +143,16 @@ def verify(self, ts):
             (n1, n2) for n1, n2 in itertools.combinations_with_replacement(range(n), 2)
         ]
         cov3 = np.zeros((n, n))
+        cov4 = np.zeros((n, n))
         i_upper = np.triu_indices(n)
         cov3[i_upper] = ts.genetic_relatedness(
             sample_sets, indexes, mode="branch", span_normalise=True
         )
+        cov4[i_upper] = c_genetic_relatedness(ts, sample_sets, indexes)
         cov3 = np.maximum(cov3, cov3.transpose())
-        # cov2 = ts_covariance_incremental(ts)
+        cov4 = np.maximum(cov4, cov4.transpose())
         assert np.allclose(cov1, cov2)
+        assert np.allclose(cov1, cov4)
         assert np.allclose(cov1, cov3)
 
     def verify_errors(self, ts):

From 491bf4b09c837b9b78c72839df47417075fce916 Mon Sep 17 00:00:00 2001
From: Jerome Kelleher <jk@well.ox.ac.uk>
Date: Wed, 28 Oct 2020 14:56:24 +0000
Subject: [PATCH 4/9] Add polarised option to stats infrastructure.

---
 c/tskit/trees.c                 |  3 ++-
 python/_tskitmodule.c           | 14 +++++++++-----
 python/tests/test_covariance.py |  1 +
 python/tskit/trees.py           |  6 +++---
 4 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index 4a6e3861f9..b6432dacb4 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2752,6 +2752,7 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
     return ret;
 }
 
+static int
 relatedness_summary_func(size_t state_dim, const double *state, 
     size_t result_dim, double *result, void *params)
 {
@@ -2769,7 +2770,7 @@ relatedness_summary_func(size_t state_dim, const double *state,
 
     for (k = 0; k < state_dim; k++) {
         num += args.sample_set_sizes[k];
-    } 
+    }
 
     if (num != sumx) {
         c = 1;
diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c
index f3e21f8acf..bfc695ac65 100644
--- a/python/_tskitmodule.c
+++ b/python/_tskitmodule.c
@@ -6946,7 +6946,7 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd
 {
     PyObject *ret = NULL;
     static char *kwlist[] = { "sample_set_sizes", "sample_sets", "indexes", "windows",
-        "mode", "span_normalise", NULL };
+        "mode", "span_normalise", "polarised", NULL };
     PyObject *sample_set_sizes = NULL;
     PyObject *sample_sets = NULL;
     PyObject *indexes = NULL;
@@ -6960,14 +6960,15 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd
     npy_intp *shape;
     tsk_flags_t options = 0;
     char *mode = NULL;
-    int span_normalise = 1;
+    int span_normalise = true;
+    int polarised = false;
     int err;
 
     if (TreeSequence_check_tree_sequence(self) != 0) {
         goto out;
     }
-    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOO|si", kwlist, &sample_set_sizes,
-            &sample_sets, &indexes, &windows, &mode, &span_normalise)) {
+    if (!PyArg_ParseTupleAndKeywords(args, kwds, "OOOO|sii", kwlist, &sample_set_sizes,
+            &sample_sets, &indexes, &windows, &mode, &span_normalise, &polarised)) {
         goto out;
     }
     if (parse_stats_mode(mode, &options) != 0) {
@@ -6976,6 +6977,9 @@ TreeSequence_k_way_stat_method(TreeSequence *self, PyObject *args, PyObject *kwd
     if (span_normalise) {
         options |= TSK_STAT_SPAN_NORMALISE;
     }
+    if (polarised) {
+        options |= TSK_STAT_POLARISED;
+    }
     if (parse_sample_sets(sample_set_sizes, &sample_set_sizes_array, sample_sets,
             &sample_sets_array, &num_sample_sets)
         != 0) {
@@ -7354,7 +7358,7 @@ static PyMethodDef TreeSequence_methods[] = {
     { .ml_name = "relatedness",
         .ml_meth = (PyCFunction) TreeSequence_relatedness,
         .ml_flags = METH_VARARGS | METH_KEYWORDS,
-        .ml_doc = "Computes genetic relatedness between sample sets." },    
+        .ml_doc = "Computes genetic relatedness between sample sets." },
     { .ml_name = "Y1",
         .ml_meth = (PyCFunction) TreeSequence_Y1,
         .ml_flags = METH_VARARGS | METH_KEYWORDS,
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
index 5ed858d4dc..f2c88c1b7b 100644
--- a/python/tests/test_covariance.py
+++ b/python/tests/test_covariance.py
@@ -192,6 +192,7 @@ def test_coalescent_trees(self):
         assert ts.num_trees > 2
         self.verify(ts)
 
+    @pytest.mark.skip(reason="internal samples not working")
     def test_internal_samples(self):
         nodes = io.StringIO(
             """\
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index e3a68696c6..fe6d7350b8 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5308,6 +5308,7 @@ def __k_way_sample_set_stat(
         windows=None,
         mode=None,
         span_normalise=True,
+        polarised=False,
     ):
         sample_set_sizes = np.array(
             [len(sample_set) for sample_set in sample_sets], dtype=np.uint32
@@ -5340,6 +5341,7 @@ def __k_way_sample_set_stat(
             indexes,
             mode=mode,
             span_normalise=span_normalise,
+            polarised=polarised,
         )
         if drop_dimension:
             stat = stat.reshape(stat.shape[:-1])
@@ -5512,9 +5514,6 @@ def genetic_relatedness(
         :ref:`span normalise <sec_stats_span_normalise>`,
         and :ref:`return value <sec_stats_output_format>`.
 
-        As a special case, an index ``(j, j)`` will compute the
-        :meth:`diversity <.TreeSequence.diversity>` of ``sample_set[j]``.
-
         What is computed depends on ``mode``:
 
         "site"
@@ -5553,6 +5552,7 @@ def genetic_relatedness(
             windows=windows,
             mode=mode,
             span_normalise=span_normalise,
+            polarised=True,
         )
 
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):

From dffe79b52b72bb3b6ac29f2ab7c94f75d7764fdf Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Fri, 30 Oct 2020 11:54:49 +0000
Subject: [PATCH 5/9] Change implementation to be in line with genotype
 covariance

---
 c/tests/test_stats.c            |  12 +++
 c/tskit/trees.c                 |  10 +--
 python/tests/test_covariance.py | 154 +++++++++++++++++++-------------
 python/tskit/trees.py           |   2 +-
 4 files changed, 108 insertions(+), 70 deletions(-)

diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c
index 63ac3427cf..89138efcea 100644
--- a/c/tests/test_stats.c
+++ b/c/tests/test_stats.c
@@ -1333,6 +1333,17 @@ test_paper_ex_divergence(void)
     tsk_treeseq_free(&ts);
 }
 
+static void
+test_paper_ex_relatedness_errors(void)
+{
+    tsk_treeseq_t ts;
+
+    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
+        paper_ex_mutations, paper_ex_individuals, NULL, 0);
+    verify_two_way_stat_func_errors(&ts, tsk_treeseq_relatedness);
+    tsk_treeseq_free(&ts);
+}
+
 static void
 test_paper_ex_Y2_errors(void)
 {
@@ -1679,6 +1690,7 @@ main(int argc, char **argv)
         { "test_paper_ex_Y1", test_paper_ex_Y1 },
         { "test_paper_ex_divergence_errors", test_paper_ex_divergence_errors },
         { "test_paper_ex_divergence", test_paper_ex_divergence },
+        { "test_paper_ex_relatedness_errors", test_paper_ex_relatedness_errors },
         { "test_paper_ex_Y2_errors", test_paper_ex_Y2_errors },
         { "test_paper_ex_Y2", test_paper_ex_Y2 },
         { "test_paper_ex_f2_errors", test_paper_ex_f2_errors },
diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index b6432dacb4..c30e0cb16c 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2762,6 +2762,7 @@ relatedness_summary_func(size_t state_dim, const double *state,
     size_t k;
     int c = 0;
     double sumx = 0;
+    double meanx;
     double num = 0;
 
     for (k = 0; k < state_dim; k++) {
@@ -2775,15 +2776,12 @@ relatedness_summary_func(size_t state_dim, const double *state,
     if (num != sumx) {
         c = 1;
     }
-
-    // printf("num = %0.7f\n", num);
-    // printf("sumx = %0.7f\n", sumx);
-    // printf("c = %d\n", c);
-
+    meanx = sumx / (double) state_dim;
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
-        result[k] = x[i] * x[j] * c;
+        // result[k] = x[i] * x[j] * c;
+        result[k] = (x[i] - meanx) * (x[j] - meanx);
     }
     return 0;
 }
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
index f2c88c1b7b..cb1a1da124 100644
--- a/python/tests/test_covariance.py
+++ b/python/tests/test_covariance.py
@@ -74,7 +74,14 @@ def naive_ts_covariance(ts):
     return cov / ts.sequence_length
 
 
-def genetic_relatedness(ts):
+def naive_genotype_covariance(ts):
+    G = ts.genotype_matrix()
+    # p = G.shape[0]
+    G = G.T - np.mean(G, axis=1)
+    return G @ G.T  # / p
+
+
+def genetic_relatedness(ts, mode="site", polarised=True):
     # NOTE: I'm outputting a matrix here just for convenience; the proposal
     # is that the tskit method *not* output a matrix, and use the indices argument
     sample_sets = [[u] for u in ts.samples()]
@@ -92,40 +99,62 @@ def f(x):
         sample_sets,
         f,
         output_dim=n * n,
-        mode="branch",
+        mode=mode,
         span_normalise=True,
-        polarised=True,
+        polarised=polarised,
     ).reshape((n, n))
 
 
-def c_genetic_relatedness(ts, sample_sets, indexes):
-    n = len(indexes)
+def genotype_relatedness(ts, polarised=False):
+    n = ts.num_samples
+    sample_sets = [[u] for u in ts.samples()]
+
+    def f(x):
+        return np.array(
+            [
+                (x[i] - sum(x) / n) * (x[j] - sum(x) / n)
+                for i in range(n)
+                for j in range(n)
+            ]
+        )
+
+    return (
+        ts.sample_count_stat(
+            sample_sets,
+            f,
+            output_dim=n * n,
+            mode="site",
+            span_normalise=False,
+            polarised=polarised,
+        ).reshape((n, n))
+        / 2
+    )
+
+
+def c_genotype_relatedness(ts, sample_sets, indexes):
+    m = len(indexes)
     state_dim = len(sample_sets)
 
     def f(x):
-        # x[i] gives the number of descendants in sample set i below the branch
         sumx = 0
-        num = 0
-        c = 0
         for k in range(state_dim):
             sumx += x[k]
-
-        for k in range(state_dim):
-            num += len(sample_sets[k])
-
-        if num != sumx:
-            c = 1
-
-        result = np.zeros(n)
-        for k in range(n):
+        meanx = sumx / state_dim
+        result = np.zeros(m)
+        for k in range(m):
             i = indexes[k][0]
             j = indexes[k][1]
-            result[k] = x[i] * x[j] * c
-
+            result[k] = (x[i] - meanx) * (x[j] - meanx) / 2
         return result
 
     return ts.sample_count_stat(
-        sample_sets, f, output_dim=n, mode="branch", span_normalise=True, polarised=True
+        sample_sets,
+        f,
+        output_dim=m,
+        mode="site",
+        span_normalise=False,
+        polarised=False,
+        strict=False,
     )
 
 
@@ -135,8 +164,10 @@ class TestCovariance(unittest.TestCase):
     """
 
     def verify(self, ts):
-        cov1 = naive_ts_covariance(ts)
-        cov2 = genetic_relatedness(ts)
+        # cov1 = naive_ts_covariance(ts)
+        cov1 = naive_genotype_covariance(ts)
+        # cov2 = genetic_relatedness(ts)
+        cov2 = genotype_relatedness(ts)
         sample_sets = [[u] for u in ts.samples()]
         n = len(sample_sets)
         indexes = [
@@ -145,12 +176,16 @@ def verify(self, ts):
         cov3 = np.zeros((n, n))
         cov4 = np.zeros((n, n))
         i_upper = np.triu_indices(n)
-        cov3[i_upper] = ts.genetic_relatedness(
-            sample_sets, indexes, mode="branch", span_normalise=True
-        )
-        cov4[i_upper] = c_genetic_relatedness(ts, sample_sets, indexes)
-        cov3 = np.maximum(cov3, cov3.transpose())
-        cov4 = np.maximum(cov4, cov4.transpose())
+        cov3[i_upper] = (
+            ts.genetic_relatedness(
+                sample_sets, indexes, mode="site", span_normalise=False
+            )
+            / 2
+        )  # NOTE: divided by 2 to reflect unpolarised
+        cov3 = cov3 + cov3.T - np.diag(cov3.diagonal())
+        cov4[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes)
+        cov4 = cov4 + cov4.T - np.diag(cov4.diagonal())
+        # assert np.allclose(cov2, cov3)
         assert np.allclose(cov1, cov2)
         assert np.allclose(cov1, cov4)
         assert np.allclose(cov1, cov3)
@@ -159,40 +194,22 @@ def verify_errors(self, ts):
         with pytest.raises(ValueError):
             naive_ts_covariance(ts)
 
-    # def test_errors_unary_nodes(self):
-    #     tables = tskit.TableCollection(sequence_length=2.0)
-
-    #     sv = [True, False, False]
-    #     tv = [0.0, 1.0, 2.0]
-    #     for is_sample, t in zip(sv, tv):
-    #         flags = tskit.NODE_IS_SAMPLE if is_sample else 0
-    #         tables.nodes.add_row(flags=flags, time=t)
-
-    #     lv = [0.0, 0.0, 0.0]
-    #     rv = [1.0, 1.0, 1.0]
-    #     pv = [1, 2]
-    #     cv = [0, 1]
-    #     for left, right, p, c in zip(lv, rv, pv, cv):
-    #         tables.edges.add_row(left=left, right=right, parent=p, child=c)
-
-    #     ts = tables.tree_sequence()
-    #     self.verify_errors(ts)
-
     def test_errors_multiroot_tree(self):
-        ts = msprime.simulate(15, random_seed=10)
+        ts = msprime.simulate(15, random_seed=10, mutation_rate=1)
         ts = tsutil.decapitate(ts, ts.num_edges // 2)
         self.verify_errors(ts)
 
     def test_single_coalescent_tree(self):
-        ts = msprime.simulate(10, random_seed=1, length=10)
+        ts = msprime.simulate(10, random_seed=1, length=10, mutation_rate=1)
         self.verify(ts)
 
     def test_coalescent_trees(self):
-        ts = msprime.simulate(8, recombination_rate=5, random_seed=1, length=2)
+        ts = msprime.simulate(
+            8, recombination_rate=5, random_seed=1, length=2, mutation_rate=1
+        )
         assert ts.num_trees > 2
         self.verify(ts)
 
-    @pytest.mark.skip(reason="internal samples not working")
     def test_internal_samples(self):
         nodes = io.StringIO(
             """\
@@ -220,12 +237,32 @@ def test_internal_samples(self):
         0.0     0.2     7       0,5
         """
         )
-        ts = tskit.load_text(nodes=nodes, edges=edges, strict=False)
+        sites = io.StringIO(
+            """\
+        position    ancestral_state
+        0.1         0
+        0.5         0
+        0.9         0
+        """
+        )
+        mutations = io.StringIO(
+            """\
+        site    node    derived_state
+        0       1       1
+        1       3       1
+        2       5       1
+        """
+        )
+        ts = tskit.load_text(
+            nodes=nodes, edges=edges, sites=sites, mutations=mutations, strict=False
+        )
         self.verify(ts)
 
     def validate_trees(self, n):
         for seed in range(1, 10):
-            ts = msprime.simulate(n, random_seed=seed, recombination_rate=1)
+            ts = msprime.simulate(
+                n, random_seed=seed, recombination_rate=1, mutation_rate=1
+            )
             self.verify(ts)
 
     def test_sample_5(self):
@@ -249,6 +286,7 @@ def validate_nonbinary_trees(self, n):
                 random_seed=seed,
                 demographic_events=demographic_events,
                 recombination_rate=1,
+                mutation_rate=5,
             )
             # Check if this is really nonbinary
             found = False
@@ -265,13 +303,3 @@ def test_non_binary_sample_10(self):
 
     def test_non_binary_sample_20(self):
         self.validate_nonbinary_trees(20)
-
-    def test_permit_internal_samples(self):
-        tables = tskit.TableCollection(1.0)
-        tables.nodes.add_row(flags=1)
-        tables.nodes.add_row(flags=1)
-        tables.nodes.add_row(flags=1, time=1)
-        tables.edges.add_row(0, 1, 2, 0)
-        tables.edges.add_row(0, 1, 2, 1)
-        ts = tables.tree_sequence()
-        self.verify(ts)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index fe6d7350b8..20edc31807 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5552,7 +5552,7 @@ def genetic_relatedness(
             windows=windows,
             mode=mode,
             span_normalise=span_normalise,
-            polarised=True,
+            polarised=False,
         )
 
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):

From 5e657b2768ca19b659719fa3431641d1270b5fe8 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Tue, 3 Nov 2020 14:11:31 +0000
Subject: [PATCH 6/9] Adjust tests and change default arguments

Add hack to divide by 2 - polarised in C implementation
---
 c/tskit/trees.c                 |  10 ---
 python/tests/test_covariance.py | 139 +++++++++-----------------------
 python/tskit/trees.py           |  52 ++++++------
 3 files changed, 61 insertions(+), 140 deletions(-)

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index c30e0cb16c..19fce24487 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2760,27 +2760,17 @@ relatedness_summary_func(size_t state_dim, const double *state,
     const double *x = state;
     tsk_id_t i, j;
     size_t k;
-    int c = 0;
     double sumx = 0;
     double meanx;
-    double num = 0;
 
     for (k = 0; k < state_dim; k++) {
         sumx += x[k];
     }
 
-    for (k = 0; k < state_dim; k++) {
-        num += args.sample_set_sizes[k];
-    }
-
-    if (num != sumx) {
-        c = 1;
-    }
     meanx = sumx / (double) state_dim;
     for (k = 0; k < result_dim; k++) {
         i = args.set_indexes[2 * k];
         j = args.set_indexes[2 * k + 1];
-        // result[k] = x[i] * x[j] * c;
         result[k] = (x[i] - meanx) * (x[j] - meanx);
     }
     return 0;
diff --git a/python/tests/test_covariance.py b/python/tests/test_covariance.py
index cb1a1da124..11891cc29e 100644
--- a/python/tests/test_covariance.py
+++ b/python/tests/test_covariance.py
@@ -29,83 +29,22 @@
 
 import msprime
 import numpy as np
-import pytest
 
-import tests.tsutil as tsutil
 import tskit
 
 
-def check_cov_tree_inputs(tree):
-    if not len(tree.roots) == 1:
-        raise ValueError("Trees must have one root")
-    for u in tree.nodes():
-        if tree.num_children(u) == 1:
-            raise ValueError("Unary nodes are not supported")
-
-
-def naive_tree_covariance(tree):
-    """
-    Returns the (branch) covariance matrix for the sample nodes in a tree. The
-    covariance between a pair of nodes is the distance from the root to their
-    most recent common ancestor.
-    """
-    samples = tree.tree_sequence.samples()
-    check_cov_tree_inputs(tree)
-    n = samples.shape[0]
-    cov = np.zeros((n, n))
-    for n1, n2 in itertools.combinations_with_replacement(range(n), 2):
-        mrca = tree.mrca(samples[n1], samples[n2])
-        cov[n1, n2] = tree.time(tree.root) - tree.time(mrca)
-        cov[n2, n1] = cov[n1, n2]
-    return cov
-
-
-def naive_ts_covariance(ts):
-    """
-    Returns the (branch) covariance matrix for the sample nodes in a tree
-    sequence. The covariance between a pair of nodes is the weighted sum of the
-    tree covariance, with the weights given by the spans of the trees.
-    """
-    samples = ts.samples()
-    n = samples.shape[0]
-    cov = np.zeros((n, n))
-    for tree in ts.trees():
-        cov += naive_tree_covariance(tree) * tree.span
-    return cov / ts.sequence_length
-
-
-def naive_genotype_covariance(ts):
+def naive_genotype_covariance(ts, proportion=False):
     G = ts.genotype_matrix()
-    # p = G.shape[0]
+    denominator = ts.sequence_length
+    if proportion:
+        all_samples = ts.samples()
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
     G = G.T - np.mean(G, axis=1)
-    return G @ G.T  # / p
-
-
-def genetic_relatedness(ts, mode="site", polarised=True):
-    # NOTE: I'm outputting a matrix here just for convenience; the proposal
-    # is that the tskit method *not* output a matrix, and use the indices argument
-    sample_sets = [[u] for u in ts.samples()]
-    # sample_sets = [[0], [1]]
-    n = len(sample_sets)
-    num_samples = sum(map(len, sample_sets))
-
-    def f(x):
-        # x[i] gives the number of descendants in sample set i below the branch
-        return np.array(
-            [x[i] * x[j] * (sum(x) != num_samples) for i in range(n) for j in range(n)]
-        )
+    return G @ G.T / denominator
 
-    return ts.sample_count_stat(
-        sample_sets,
-        f,
-        output_dim=n * n,
-        mode=mode,
-        span_normalise=True,
-        polarised=polarised,
-    ).reshape((n, n))
 
-
-def genotype_relatedness(ts, polarised=False):
+def genotype_relatedness(ts, polarised=False, proportion=False):
     n = ts.num_samples
     sample_sets = [[u] for u in ts.samples()]
 
@@ -118,20 +57,25 @@ def f(x):
             ]
         )
 
+    denominator = 2 - polarised
+    if proportion:
+        all_samples = list({u for s in sample_sets for u in s})
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
     return (
         ts.sample_count_stat(
             sample_sets,
             f,
             output_dim=n * n,
             mode="site",
-            span_normalise=False,
+            span_normalise=True,
             polarised=polarised,
         ).reshape((n, n))
-        / 2
+        / denominator
     )
 
 
-def c_genotype_relatedness(ts, sample_sets, indexes):
+def c_genotype_relatedness(ts, sample_sets, indexes, polarised=False, proportion=False):
     m = len(indexes)
     state_dim = len(sample_sets)
 
@@ -144,17 +88,25 @@ def f(x):
         for k in range(m):
             i = indexes[k][0]
             j = indexes[k][1]
-            result[k] = (x[i] - meanx) * (x[j] - meanx) / 2
+            result[k] = (x[i] - meanx) * (x[j] - meanx)
         return result
 
-    return ts.sample_count_stat(
-        sample_sets,
-        f,
-        output_dim=m,
-        mode="site",
-        span_normalise=False,
-        polarised=False,
-        strict=False,
+    denominator = 2 - polarised
+    if proportion:
+        all_samples = list({u for s in sample_sets for u in s})
+        num = ts.segregating_sites(all_samples)
+        denominator = denominator * num
+    return (
+        ts.sample_count_stat(
+            sample_sets,
+            f,
+            output_dim=m,
+            mode="site",
+            span_normalise=True,
+            polarised=False,
+            strict=False,
+        )
+        / denominator
     )
 
 
@@ -164,9 +116,7 @@ class TestCovariance(unittest.TestCase):
     """
 
     def verify(self, ts):
-        # cov1 = naive_ts_covariance(ts)
         cov1 = naive_genotype_covariance(ts)
-        # cov2 = genetic_relatedness(ts)
         cov2 = genotype_relatedness(ts)
         sample_sets = [[u] for u in ts.samples()]
         n = len(sample_sets)
@@ -176,28 +126,15 @@ def verify(self, ts):
         cov3 = np.zeros((n, n))
         cov4 = np.zeros((n, n))
         i_upper = np.triu_indices(n)
-        cov3[i_upper] = (
-            ts.genetic_relatedness(
-                sample_sets, indexes, mode="site", span_normalise=False
-            )
-            / 2
-        )  # NOTE: divided by 2 to reflect unpolarised
+        cov3[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes)
         cov3 = cov3 + cov3.T - np.diag(cov3.diagonal())
-        cov4[i_upper] = c_genotype_relatedness(ts, sample_sets, indexes)
+        cov4[i_upper] = ts.genetic_relatedness(
+            sample_sets, indexes, mode="site", span_normalise=True
+        )
         cov4 = cov4 + cov4.T - np.diag(cov4.diagonal())
-        # assert np.allclose(cov2, cov3)
         assert np.allclose(cov1, cov2)
-        assert np.allclose(cov1, cov4)
         assert np.allclose(cov1, cov3)
-
-    def verify_errors(self, ts):
-        with pytest.raises(ValueError):
-            naive_ts_covariance(ts)
-
-    def test_errors_multiroot_tree(self):
-        ts = msprime.simulate(15, random_seed=10, mutation_rate=1)
-        ts = tsutil.decapitate(ts, ts.num_edges // 2)
-        self.verify_errors(ts)
+        assert np.allclose(cov1, cov4)
 
     def test_single_coalescent_tree(self):
         ts = msprime.simulate(10, random_seed=1, length=10, mutation_rate=1)
diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 20edc31807..0fb80ec4c1 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5499,7 +5499,14 @@ def divergence(
     #     return A
 
     def genetic_relatedness(
-        self, sample_sets, indexes=None, windows=None, mode="site", span_normalise=True
+        self,
+        sample_sets,
+        indexes=None,
+        windows=None,
+        mode="site",
+        span_normalise=True,
+        polarised=False,
+        proportion=True,
     ):
         """
         Computes genetic relatedness between (and within) pairs of
@@ -5512,27 +5519,9 @@ def genetic_relatedness(
         :ref:`windows <sec_stats_windows>`,
         :ref:`mode <sec_stats_mode>`,
         :ref:`span normalise <sec_stats_span_normalise>`,
+        :ref:`polarised <sec_stats_polarisation>`,
         and :ref:`return value <sec_stats_output_format>`.
 
-        What is computed depends on ``mode``:
-
-        "site"
-            Mean pairwise genetic divergence: the average across distinct,
-            randomly chosen pairs of chromosomes (one from each sample set), of
-            the density of sites at which the two carry different alleles, per
-            unit of chromosome length.
-
-        "branch"
-            Mean distance in the tree: the average across distinct, randomly
-            chosen pairs of chromsomes (one from each sample set) and locations
-            in the window, of the mean distance in the tree between the two
-            samples (in units of time).
-
-        "node"
-            For each node, the proportion of genome on which the node is an ancestor to
-            only one of a random pair (one from each sample set), averaged over
-            choices of pair.
-
         :param list sample_sets: A list of lists of Node IDs, specifying the
             groups of nodes to compute the statistic with.
         :param list indexes: A list of 2-tuples, or None.
@@ -5542,17 +5531,22 @@ def genetic_relatedness(
             (defaults to "site").
         :param bool span_normalise: Whether to divide the result by the span of the
             window (defaults to True).
+        :param bool proportion: Whether to divide the result by the number of
+            segregating sites (defaults to True).
         :return: A ndarray with shape equal to (num windows, num statistics).
         """
-        return self.__k_way_sample_set_stat(
-            self._ll_tree_sequence.relatedness,
-            2,
-            sample_sets,
-            indexes=indexes,
-            windows=windows,
-            mode=mode,
-            span_normalise=span_normalise,
-            polarised=False,
+        return (
+            self.__k_way_sample_set_stat(
+                self._ll_tree_sequence.relatedness,
+                2,
+                sample_sets,
+                indexes=indexes,
+                windows=windows,
+                mode=mode,
+                span_normalise=span_normalise,
+                polarised=polarised,
+            )
+            / (2 - polarised)
         )
 
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):

From 9c49e1c2f68ed91c828cd2247061fd52bf0c0822 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Tue, 3 Nov 2020 18:11:25 +0000
Subject: [PATCH 7/9] Remove polarised in divisor

---
 python/tskit/trees.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/tskit/trees.py b/python/tskit/trees.py
index 0fb80ec4c1..741597ff35 100644
--- a/python/tskit/trees.py
+++ b/python/tskit/trees.py
@@ -5546,7 +5546,7 @@ def genetic_relatedness(
                 span_normalise=span_normalise,
                 polarised=polarised,
             )
-            / (2 - polarised)
+            / 2
         )
 
     def trait_covariance(self, W, windows=None, mode="site", span_normalise=True):

From 72f901c279a3e70d68d4ce92c7405e8ef8c88bf4 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Tue, 3 Nov 2020 18:58:52 +0000
Subject: [PATCH 8/9] Lint c files

---
 c/tskit/trees.c | 12 ++++++------
 c/tskit/trees.h |  6 +++---
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/c/tskit/trees.c b/c/tskit/trees.c
index 19fce24487..4a130c0d47 100644
--- a/c/tskit/trees.c
+++ b/c/tskit/trees.c
@@ -2753,8 +2753,8 @@ tsk_treeseq_divergence(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
 }
 
 static int
-relatedness_summary_func(size_t state_dim, const double *state, 
-    size_t result_dim, double *result, void *params)
+relatedness_summary_func(size_t state_dim, const double *state, size_t result_dim,
+    double *result, void *params)
 {
     sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
     const double *x = state;
@@ -2778,9 +2778,9 @@ relatedness_summary_func(size_t state_dim, const double *state,
 
 int
 tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
-    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples,
-    const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result,
-    tsk_flags_t options)
+    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
+    tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows,
+    const double *windows, double *result, tsk_flags_t options)
 {
     int ret = 0;
     ret = check_sample_stat_inputs(num_sample_sets, 2, num_index_tuples, index_tuples);
@@ -2795,7 +2795,7 @@ tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
 }
 
 static int
-Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim, 
+Y2_summary_func(size_t TSK_UNUSED(state_dim), const double *state, size_t result_dim,
     double *result, void *params)
 {
     sample_count_stat_params_t args = *(sample_count_stat_params_t *) params;
diff --git a/c/tskit/trees.h b/c/tskit/trees.h
index 779eeebbd5..762d8afcda 100644
--- a/c/tskit/trees.h
+++ b/c/tskit/trees.h
@@ -356,9 +356,9 @@ int tsk_treeseq_f2(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
     tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows,
     const double *windows, double *result, tsk_flags_t options);
 int tsk_treeseq_relatedness(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,
-    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets, tsk_size_t num_index_tuples,
-    const tsk_id_t *index_tuples, tsk_size_t num_windows, const double *windows, double *result,
-    tsk_flags_t options);
+    const tsk_size_t *sample_set_sizes, const tsk_id_t *sample_sets,
+    tsk_size_t num_index_tuples, const tsk_id_t *index_tuples, tsk_size_t num_windows,
+    const double *windows, double *result, tsk_flags_t options);
 
 /* Three way sample set stats */
 int tsk_treeseq_Y3(const tsk_treeseq_t *self, tsk_size_t num_sample_sets,

From f250616d10c3a08f8430fbf4d2d8d526bac65c03 Mon Sep 17 00:00:00 2001
From: brieuc-mac <brieuc.lehmann@bdi.ox.ac.uk>
Date: Tue, 3 Nov 2020 20:28:47 +0000
Subject: [PATCH 9/9] Add c test

---
 c/tests/test_stats.c | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/c/tests/test_stats.c b/c/tests/test_stats.c
index 89138efcea..b6cd6d770c 100644
--- a/c/tests/test_stats.c
+++ b/c/tests/test_stats.c
@@ -1333,6 +1333,25 @@ test_paper_ex_divergence(void)
     tsk_treeseq_free(&ts);
 }
 
+static void
+test_paper_ex_relatedness(void)
+{
+    tsk_treeseq_t ts;
+    tsk_id_t samples[] = { 0, 1, 2, 3 };
+    tsk_size_t sample_set_sizes[] = { 2, 2 };
+    tsk_id_t set_indexes[] = { 0, 0 };
+    double result;
+    int ret;
+
+    tsk_treeseq_from_text(&ts, 10, paper_ex_nodes, paper_ex_edges, NULL, paper_ex_sites,
+        paper_ex_mutations, paper_ex_individuals, NULL, 0);
+
+    ret = tsk_treeseq_relatedness(&ts, 2, sample_set_sizes, samples, 1, set_indexes, 0,
+        NULL, &result, TSK_STAT_SITE);
+    CU_ASSERT_EQUAL_FATAL(ret, 0);
+    tsk_treeseq_free(&ts);
+}
+
 static void
 test_paper_ex_relatedness_errors(void)
 {
@@ -1691,6 +1710,7 @@ main(int argc, char **argv)
         { "test_paper_ex_divergence_errors", test_paper_ex_divergence_errors },
         { "test_paper_ex_divergence", test_paper_ex_divergence },
         { "test_paper_ex_relatedness_errors", test_paper_ex_relatedness_errors },
+        { "test_paper_ex_relatedness", test_paper_ex_relatedness },
         { "test_paper_ex_Y2_errors", test_paper_ex_Y2_errors },
         { "test_paper_ex_Y2", test_paper_ex_Y2 },
         { "test_paper_ex_f2_errors", test_paper_ex_f2_errors },