TableCollection force_offset_64 #1602

Merged (2 commits, Aug 4, 2021)
48 changes: 48 additions & 0 deletions python/_tskitmodule.c
@@ -6428,6 +6428,46 @@ TableCollection_load(TableCollection *self, PyObject *args, PyObject *kwds)
    return ret;
}

static PyObject *
TableCollection_asdict(TableCollection *self, PyObject *args, PyObject *kwds)
{
    PyObject *ret = NULL;
    int force_offset_64 = 0;
    static char *kwlist[] = { "force_offset_64", NULL };

    if (TableCollection_check_state(self) != 0) {
        goto out;
    }
    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i", kwlist, &force_offset_64)) {
        goto out;
    }
    /* Use the LWT tables code */
    ret = dump_tables_dict(self->tables, force_offset_64);
out:
    return ret;
}

static PyObject *
TableCollection_fromdict(TableCollection *self, PyObject *args)
{
    PyObject *ret = NULL;
    PyObject *dict = NULL;

    if (TableCollection_check_state(self) != 0) {
        goto out;
    }
    if (!PyArg_ParseTuple(args, "O!", &PyDict_Type, &dict)) {
        goto out;
    }
    /* Use the LWT tables code */
    if (parse_table_collection_dict(self->tables, dict) != 0) {
        goto out;
    }
    ret = Py_BuildValue("");
out:
    return ret;
}

static PyGetSetDef TableCollection_getsetters[] = {
{ .name = "individuals",
.get = (getter) TableCollection_get_individuals,
@@ -6548,6 +6588,14 @@ static PyMethodDef TableCollection_methods[] = {
        .ml_meth = (PyCFunction) TableCollection_load,
        .ml_flags = METH_VARARGS | METH_KEYWORDS,
        .ml_doc = "Loads the table collection out to the specified file." },
    { .ml_name = "asdict",
        .ml_meth = (PyCFunction) TableCollection_asdict,
        .ml_flags = METH_VARARGS | METH_KEYWORDS,
        .ml_doc = "Returns the table collection in dictionary encoding. " },
    { .ml_name = "fromdict",
        .ml_meth = (PyCFunction) TableCollection_fromdict,
        .ml_flags = METH_VARARGS,
        .ml_doc = "Sets the state of this table collection from the specified dict" },
    { NULL } /* Sentinel */
};
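For orientation, a minimal sketch of how these new low-level entry points are driven from Python, mirroring the tests added below (the empty table collection and the literal flag value are illustrative only):

import _tskit

tc = _tskit.TableCollection(sequence_length=1.0)
d = tc.asdict(force_offset_64=1)  # parsed with "|i": an optional int flag defaulting to 0
tc2 = _tskit.TableCollection(sequence_length=0)
tc2.fromdict(d)  # rebuilds the full state, including sequence_length, from the dict
assert tc.equals(tc2)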

31 changes: 31 additions & 0 deletions python/tests/test_lowlevel.py
@@ -408,6 +408,37 @@ def test_equals_bad_args(self):
with pytest.raises(TypeError):
tc.equals(tc, ignore_timestamps=bad_bool)

    def test_asdict(self):
        for ts in self.get_example_tree_sequences():
            tc = _tskit.TableCollection(sequence_length=ts.get_sequence_length())
            ts.dump_tables(tc)
            d = tc.asdict()
            # Method is tested extensively elsewhere, just basic sanity check here
            assert isinstance(d, dict)
            assert len(d) > 0

    def test_fromdict(self):
        for ts in self.get_example_tree_sequences():
            tc1 = _tskit.TableCollection(sequence_length=ts.get_sequence_length())
            ts.dump_tables(tc1)
            d = tc1.asdict()
            tc2 = _tskit.TableCollection(sequence_length=0)
            tc2.fromdict(d)
            assert tc1.equals(tc2)

    def test_asdict_bad_args(self):
        ts = msprime.simulate(10, random_seed=1242)
Review comment (Member): You could use simple_degree1_ts_fixture here, but I don't feel strongly about it.

Reply (Member, author): I was following the conventions in the rest of the class, so I don't think it's worth changing to a fixture here.

        tc = ts.tables._ll_tables
        for bad_type in [None, 0.1, "str"]:
            with pytest.raises(TypeError):
                tc.asdict(force_offset_64=bad_type)

    def test_fromdict_bad_args(self):
        tc = _tskit.TableCollection(0)
        for bad_type in [None, 0.1, "str"]:
            with pytest.raises(TypeError):
                tc.fromdict(bad_type)


class TestTableMethods:
"""
71 changes: 44 additions & 27 deletions python/tests/test_tables.py
@@ -3129,31 +3129,6 @@ class TestTableCollection:
Tests for the convenience wrapper around a collection of related tables.
"""

    def add_metadata(self, tc):
        tc.metadata_schema = tskit.MetadataSchema(
            {
                "codec": "struct",
                "type": "object",
                "properties": {"top-level": {"type": "string", "binaryFormat": "50p"}},
            }
        )
        tc.metadata = {"top-level": "top-level-metadata"}
        for table in tskit.TABLE_NAMES:
            t = getattr(tc, table)
            if hasattr(t, "metadata_schema"):
                t.packset_metadata(
                    [f"{table}-{i:10}".encode() for i in range(t.num_rows)]
                )
                t.metadata_schema = tskit.MetadataSchema(
                    {
                        "codec": "struct",
                        "type": "object",
                        "properties": {
                            table: {"type": "string", "binaryFormat": "16p"}
                        },
                    }
                )

def test_table_references(self):
ts = msprime.simulate(10, mutation_rate=2, random_seed=1)
tables = ts.tables
@@ -3229,6 +3204,26 @@ def test_asdict(self, ts_fixture):
assert t1.has_index()
assert t2.has_index()

    @pytest.mark.parametrize("force_offset_64", [True, False])
    def test_asdict_force_offset_64(self, ts_fixture, force_offset_64):
        tables = ts_fixture.dump_tables()
        d = tables.asdict(force_offset_64=force_offset_64)
        for table in tables.name_map:
            for name, column in d[table].items():
                if name.endswith("_offset"):
                    if force_offset_64:
                        assert column.dtype == np.uint64
                    else:
                        assert column.dtype == np.uint32

    def test_asdict_force_offset_64_default(self, ts_fixture):
        tables = ts_fixture.dump_tables()
        d = tables.asdict()
        for table in tables.name_map:
            for name, column in d[table].items():
                if name.endswith("_offset"):
                    assert column.dtype == np.uint32

def test_asdict_lifecycle(self, ts_fixture):
tables = ts_fixture.dump_tables()
tables_dict = tables.asdict()
@@ -3945,13 +3940,35 @@ def test_bad_metadata(self):
assert tc._ll_tables.metadata == b""


class TestTableCollectionPickle(TestTableCollection):
def add_table_collection_metadata(tc):
    tc.metadata_schema = tskit.MetadataSchema(
        {
            "codec": "struct",
            "type": "object",
            "properties": {"top-level": {"type": "string", "binaryFormat": "50p"}},
        }
    )
    tc.metadata = {"top-level": "top-level-metadata"}
    for table in tskit.TABLE_NAMES:
        t = getattr(tc, table)
        if hasattr(t, "metadata_schema"):
            t.packset_metadata([f"{table}-{i:10}".encode() for i in range(t.num_rows)])
            t.metadata_schema = tskit.MetadataSchema(
                {
                    "codec": "struct",
                    "type": "object",
                    "properties": {table: {"type": "string", "binaryFormat": "16p"}},
                }
            )


class TestTableCollectionPickle:
    """
    Tests that we can round-trip table collections through pickle.
    """

    def verify(self, tables):
        self.add_metadata(tables)
        add_table_collection_metadata(tables)
        other_tables = pickle.loads(pickle.dumps(tables))
        tables.assert_equals(other_tables)

71 changes: 16 additions & 55 deletions python/tskit/tables.py
@@ -1,6 +1,6 @@
# MIT License
#
# Copyright (c) 2018-2020 Tskit Developers
# Copyright (c) 2018-2021 Tskit Developers
# Copyright (c) 2017 University of Oxford
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -2461,29 +2461,22 @@ def metadata_bytes(self) -> Any:
"""
return self._ll_tables.metadata

    def asdict(self):
    def asdict(self, force_offset_64=False):
        """
        Returns a dictionary representation of this TableCollection.
        Returns the nested dictionary representation of this TableCollection
        used for interchange.

        Note: the semantics of this method changed at tskit 0.1.0. Previously a
        map of table names to the tables themselves was returned.

        :param bool force_offset_64: If True, all offset columns will have dtype
            np.uint64. If False (the default) the offset array columns will have
            a dtype of either np.uint32 or np.uint64, depending on the size of the
            corresponding data array.
        :return: The dictionary representation of this table collection.
        :rtype: dict
        """
        ret = {
            "encoding_version": (1, 3),
            "sequence_length": self.sequence_length,
            "metadata_schema": repr(self.metadata_schema),
            "metadata": self.metadata_schema.encode_row(self.metadata),
            "individuals": self.individuals.asdict(),
            "nodes": self.nodes.asdict(),
            "edges": self.edges.asdict(),
            "migrations": self.migrations.asdict(),
            "sites": self.sites.asdict(),
            "mutations": self.mutations.asdict(),
            "populations": self.populations.asdict(),
            "provenances": self.provenances.asdict(),
            "indexes": self.indexes.asdict(),
        }
        return ret
        return self._ll_tables.asdict(force_offset_64)

@property
def name_map(self):
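A hedged sketch of what the new force_offset_64 keyword does in practice, following the tests above; the simulated tree sequence here is only a stand-in for the ts_fixture used in the test suite, and nodes/metadata_offset is just one representative offset column:

import msprime
import numpy as np

tables = msprime.simulate(10, random_seed=42).dump_tables()
d64 = tables.asdict(force_offset_64=True)
d32 = tables.asdict()  # default: offset dtype chosen to fit the corresponding data column
assert d64["nodes"]["metadata_offset"].dtype == np.uint64
assert d32["nodes"]["metadata_offset"].dtype == np.uint32  # small tables stay 32-bit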
@@ -2701,45 +2694,13 @@ def dump(self, file_or_path):

    # Unpickle support
    def __setstate__(self, state):
        self.__init__(state["sequence_length"])
        self.metadata_schema = tskit.parse_metadata_schema(state["metadata_schema"])
        self.metadata = self.metadata_schema.decode_row(state["metadata"])
        self.individuals.set_columns(**state["individuals"])
        self.nodes.set_columns(**state["nodes"])
        self.edges.set_columns(**state["edges"])
        self.migrations.set_columns(**state["migrations"])
        self.sites.set_columns(**state["sites"])
        self.mutations.set_columns(**state["mutations"])
        self.populations.set_columns(**state["populations"])
        self.provenances.set_columns(**state["provenances"])
        self.__init__()
        self._ll_tables.fromdict(state)

    @classmethod
    def fromdict(self, tables_dict):
        tables = TableCollection(tables_dict["sequence_length"])
        try:
            tables.metadata_schema = tskit.parse_metadata_schema(
                tables_dict["metadata_schema"]
            )
        except KeyError:
            pass
        try:
            tables.metadata = tables.metadata_schema.decode_row(tables_dict["metadata"])
        except KeyError:
            pass
        tables.individuals.set_columns(**tables_dict["individuals"])
        tables.nodes.set_columns(**tables_dict["nodes"])
        tables.edges.set_columns(**tables_dict["edges"])
        tables.migrations.set_columns(**tables_dict["migrations"])
        tables.sites.set_columns(**tables_dict["sites"])
        tables.mutations.set_columns(**tables_dict["mutations"])
        tables.populations.set_columns(**tables_dict["populations"])
        tables.provenances.set_columns(**tables_dict["provenances"])

        # Indexes must be last as other wise the check for their consistency will fail
        try:
            tables.indexes = TableCollectionIndexes(**tables_dict["indexes"])
        except KeyError:
            pass
        tables = TableCollection()
        tables._ll_tables.fromdict(tables_dict)
        return tables

def copy(self):
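Since __setstate__ and TableCollection.fromdict now delegate to the low-level dict code, a pickle round trip exercises the whole path; a minimal sketch with an empty collection, purely for illustration:

import pickle

import tskit

tables = tskit.TableCollection(sequence_length=1.0)
restored = pickle.loads(pickle.dumps(tables))
tables.assert_equals(restored)  # the dict encoding carries the full table collection state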