diff --git a/README.md b/README.md
index 3dc3b43..71e148d 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@
 [![license](https://img.shields.io/github/license/Lyncs-API/lyncs.io?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/blob/master/LICENSE)
 [![build & test](https://img.shields.io/github/workflow/status/Lyncs-API/lyncs.io/build%20&%20test?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/actions)
 [![codecov](https://img.shields.io/codecov/c/github/Lyncs-API/lyncs.io?logo=codecov&logoColor=white)](https://codecov.io/gh/Lyncs-API/lyncs.io)
-[![pylint](https://img.shields.io/badge/pylint%20score-9.6%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/)
+[![pylint](https://img.shields.io/badge/pylint%20score-9.5%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/)
 [![black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=codefactor&logoColor=white)](https://github.com/ambv/black)
 
 Lyncs IO offers two high-level functions `load` and `save` (or `dump` as alias of `save`).
diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py
index 1690651..5703bfc 100644
--- a/lyncs_io/convert.py
+++ b/lyncs_io/convert.py
@@ -5,20 +5,64 @@
 from datetime import datetime
 import numpy
-from .utils import is_dask_array
+from dask.array.core import Array as DaskArray
+from torch import Tensor, tensor
+from .utils import (
+    is_dask_array,
+    is_sparse_matrix,
+    from_reduced,
+    in_torch_nn,
+    layer_to_tensor,
+    tensor_to_numpy,
+    check_support,
+)
 from . import __version__
 
 
-def get_attrs(data):
+def reconstruct_reduced(attrs):
+    "Reconstructs an object from the tuple returned by __reduce__"
+    fnc, args, state = attrs
+    obj = fnc(*args)
+
+    if hasattr(obj, "__setstate__"):
+        obj.__setstate__(state)
+    else:
+        obj.__dict__.update(state)
+
+    return obj
+
+
+def get_attrs(data, keep_type=False):
     """
-    Returns the list of attributes needed for reconstructing a data object
+    Returns the list of attributes needed for reconstructing a data object.
+    With keep_type=True, the actual type is stored under "type" (as needed
+    by from_array); otherwise its repr is stored. Types without a native
+    array conversion are described by the tuple returned by __reduce__
+    (or __getstate__) instead of the attributes dict.
     """
-    return {
+    _dict = {
         "_lyncs_io": __version__,
         "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
-        "type": repr(type(data)),
+        "type": type(data) if keep_type else repr(type(data)),
     }
 
+    if keep_type and _dict["type"] not in (
+        Tensor,
+        numpy.ndarray,
+        DaskArray,
+        type(None),
+    ):
+        # Every object inherits __reduce__ from object, so this branch
+        # covers all remaining types; __getstate__ is kept as a fallback
+        if hasattr(data, "__reduce__"):
+            return data.__reduce__()
+        if hasattr(data, "__getstate__"):
+            return _dict["type"], data.__getstate__()
+
+        # No need for __dict__:
+        # "If the method is absent, the instance's __dict__ is pickled as usual"
+
+    return _dict
+
 
 def get_array_attrs(data):
     "Returns attributes of an array"
@@ -39,6 +83,16 @@ def _to_array(data):
     "Converts data to array"
     if is_dask_array(data):
         return data
+
+    if is_sparse_matrix(data):
+        return data.toarray()
+
+    if in_torch_nn(data):
+        return tensor_to_numpy(layer_to_tensor(data))
+
+    if isinstance(data, Tensor):
+        return tensor_to_numpy(data)
+
     return numpy.array(data)
 
 
@@ -47,9 +101,16 @@ def to_array(data):
     Converts a data object to array.
     Returns also the list of attributes needed for reconstructing it.
     """
-    attrs = get_attrs(data)
+    check_support(data)
+
+    attrs = get_attrs(data, keep_type=True)
     data = _to_array(data)
-    attrs.update(get_array_attrs(data))
+
+    # attrs is a dict only for natively handled types; otherwise it is
+    # the __reduce__ (or __getstate__) tuple and needs no array attrs
+    if isinstance(attrs, dict):
+        attrs.update(get_array_attrs(data))
+
     return data, attrs
 
 
@@ -80,5 +141,10 @@
     """
     Converts array to a data object. Undoes to_array.
    """
-    # TODO
+    if from_reduced(attrs):
+        return reconstruct_reduced(attrs)
+
+    if isinstance(attrs, dict) and attrs["type"] is Tensor:
+        return tensor(data)
+
     return data
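Note on the convert.py changes above: `to_array`/`from_array` are meant to round-trip a supported object through a numpy array plus reconstruction attributes. A minimal sketch of the intended usage, mirroring the tests added below (assumes the torch/scipy dependencies are installed):

```python
from scipy import sparse
from lyncs_io.convert import to_array, from_array

matrix = sparse.random(4, 4, format="csr")
arr, attrs = to_array(matrix)       # attrs is the __reduce__ tuple here
restored = from_array(arr, attrs)   # rebuilt by reconstruct_reduced
assert (matrix != restored).nnz == 0  # no stored entry differs
```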
""" - # TODO + + if from_reduced(attrs): + return reconstruct_reduced(attrs) + + if isinstance(attrs, dict): + if attrs["type"] == Tensor: + return tensor(data) + return data diff --git a/lyncs_io/traverse_reduced.py b/lyncs_io/traverse_reduced.py new file mode 100644 index 0000000..d73339f --- /dev/null +++ b/lyncs_io/traverse_reduced.py @@ -0,0 +1,67 @@ +import numpy +import torch +from collections import OrderedDict +from typing import Iterable +from torch.nn import Conv1d +from pprint import pprint + + +class Dummy(list): + pass + + +def from_dummy(ds): + for i, elt in enumerate(ds): + if isinstance(elt, Dummy): + ds[i] = from_dummy(elt) + if isinstance(ds, Dummy): + ds = tuple(ds) + return ds + + +def to_dummy(ds): + if isinstance(ds, tuple): + ds = Dummy(ds) + for i, elt in enumerate(ds): + if isinstance(elt, tuple): + ds[i] = to_dummy(elt) + return ds + + +def gen(): + num = 0 + while True: + yield num + num += 1 + + +gen = gen() +global_dict = {} + + +def fnc(s, reverse=False, gen=gen): + global global_dict + + if isinstance(s, (dict, OrderedDict)): + x = {key: fnc(value, reverse) for key, value in s.items()} + return OrderedDict(x) if isinstance(s, OrderedDict) else x + elif isinstance(s, (list, tuple)): + x = [fnc(e, reverse) for e in s] + return tuple(x) if isinstance(s, tuple) else x + elif isinstance(s, torch.nn.Parameter): + placeholder_no = str(next(gen)) + global_dict["placeholder" + placeholder_no] = s + return "placeholder" + str(placeholder_no) + elif isinstance(s, str) and reverse and s in global_dict.keys(): + return global_dict[s] + return s + + +c = Conv1d(4, 4, 3) +reduced = c.__reduce__() + +result = fnc(reduced) +after = fnc(result, reverse=True) + +test = reduced == after and reduced != result +print(test) diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index 87b24d5..d72610a 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -5,8 +5,23 @@ from functools import wraps from pathlib import Path from os.path import splitext +from inspect import getmembers from collections import defaultdict +from warnings import warn +import torch.nn +from pandas import DataFrame +from numpy import ndarray +from torch import Tensor from lyncs_utils.io import FileLike +from scipy.sparse import ( + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix, +) def find_file(filename): @@ -54,6 +69,85 @@ def is_dask_array(obj): return False +""" !!!!!!!!!!! """ + + +def check_support(obj): + "Checks whether the object's type is supported" + if not ( + is_sparse_matrix(obj) + or is_dask_array(obj) + or in_torch_nn(obj) + or isinstance(obj, (ndarray, DataFrame, Tensor, type(None))) + ): + raise TypeError(f"{obj} {type(obj)} is not supported yet") + + +def in_torch_nn(obj): + "Checks if an object belongs in the torch.nn module (Layers)" + members = tuple([m[1] for m in getmembers(torch.nn) if isinstance(m[1], type)]) + return isinstance(obj, members) + + +def layer_to_tensor(layer): + "Converts a torch layer to a tensor" + _, _, kwargs = layer.__reduce__() + params = kwargs["_parameters"] + items = list(params.items()) + param = items[0][1] + return param[:] + + +def layers_are_equal(layer1, layer2): + "Compare two layers. 
diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py
index 87b24d5..d72610a 100644
--- a/lyncs_io/utils.py
+++ b/lyncs_io/utils.py
@@ -5,8 +5,22 @@
 from functools import wraps
 from pathlib import Path
 from os.path import splitext
+from inspect import getmembers
 from collections import defaultdict
+import torch.nn
+from pandas import DataFrame
+from numpy import ndarray
+from torch import Tensor
 from lyncs_utils.io import FileLike
+from scipy.sparse import (
+    csc_matrix,
+    csr_matrix,
+    coo_matrix,
+    bsr_matrix,
+    dia_matrix,
+    dok_matrix,
+    lil_matrix,
+)
 
 
 def find_file(filename):
@@ -54,6 +68,77 @@ def is_dask_array(obj):
     return False
 
 
+def check_support(obj):
+    "Checks whether the object's type is supported; raises TypeError otherwise"
+    if not (
+        is_sparse_matrix(obj)
+        or is_dask_array(obj)
+        or in_torch_nn(obj)
+        or isinstance(obj, (ndarray, DataFrame, Tensor, type(None)))
+    ):
+        raise TypeError(f"Type {type(obj)} is not supported yet")
+
+
+def in_torch_nn(obj):
+    "Checks whether an object is an instance of a torch.nn class (e.g. a layer)"
+    members = tuple(m[1] for m in getmembers(torch.nn) if isinstance(m[1], type))
+    return isinstance(obj, members)
+
+
+def layer_to_tensor(layer):
+    "Extracts the first parameter of a torch layer (typically the weight) as a tensor"
+    state = layer.__reduce__()[2]
+    param = next(iter(state["_parameters"].values()))
+    return param[:]
+
+
+def layers_are_equal(layer1, layer2):
+    "Compares two layers via __reduce__; == on layers compares tensors elementwise"
+    return layer1.__reduce__() == layer2.__reduce__()
+
+
+def tensor_to_numpy(tensor):
+    "Converts a torch tensor to a numpy array"
+    return tensor.detach().numpy()
+
+
+def is_sparse_matrix(obj):
+    "Checks whether an object is a scipy sparse matrix"
+    return isinstance(
+        obj,
+        (
+            csc_matrix,
+            csr_matrix,
+            coo_matrix,
+            bsr_matrix,
+            dia_matrix,
+            dok_matrix,
+            lil_matrix,
+        ),
+    )
+
+
+def from_state(attrs):
+    "Returns whether attrs matches the (type, state) pair produced by __getstate__"
+    return (
+        isinstance(attrs, tuple)
+        and len(attrs) == 2
+        and callable(attrs[0])
+        and isinstance(attrs[1], dict)
+    )
+
+
+def from_reduced(attrs):
+    "Returns whether attrs matches the 3-tuple returned by __reduce__"
+    return (
+        isinstance(attrs, tuple)
+        and len(attrs) == 3
+        and callable(attrs[0])
+        and isinstance(attrs[1], tuple)
+        and isinstance(attrs[2], dict)
+    )
+
+
 def swap(fnc):
     "Returns a wrapper that swaps the first two arguments of the function"
     return wraps(fnc)(
diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py
new file mode 100644
index 0000000..eff901e
--- /dev/null
+++ b/test/serial/test_convert.py
@@ -0,0 +1,80 @@
+import numpy as np
+import dask.array as da
+from pandas import DataFrame
+from scipy import sparse
+from torch import rand
+from torch.nn import Conv1d
+
+from lyncs_io.convert import to_array, from_array
+from lyncs_io.utils import layers_are_equal
+
+
+def test_to_from_array():
+
+    # TODO: [x] sparse matrices
+    # TODO: [x] ndarrays
+    # TODO: [x] built-ins
+    # TODO: [x] dask
+    # TODO: [x] torch
+    # TODO: [x] DataFrames
+
+    # ??
+    # TODO: [ ] keras
+    # TODO: [ ] tensorflow
+
+    # Test DataFrames
+    df = DataFrame({"A": [1, 2], "B": [3, 4]})
+    arr, attrs = to_array(df)
+    new_df = from_array(arr, attrs)
+
+    assert (arr == np.array(df)).all()
+    assert isinstance(new_df, type(df))
+    assert df.equals(new_df)
+
+    # Test sparse matrices
+    formats = ["csr", "csc", "coo", "bsr", "dia", "dok", "lil"]
+
+    for fmt in formats:
+        matrix = sparse.random(4, 4, format=fmt)
+        arr, attrs = to_array(matrix)
+        new_m = from_array(arr, attrs)
+
+        assert (arr == matrix.toarray()).all()
+        assert isinstance(new_m, type(matrix))
+        # No stored entry may differ between the two matrices
+        assert (matrix != new_m).nnz == 0
+        # For dense arrays np.allclose is a good equality test; for
+        # sparse matrices of this size the dense comparison is fine too
+        assert np.allclose(matrix.toarray(), new_m.toarray())
+
+    # Test ndarrays
+    ndarr = np.random.rand(2, 2)
+    arr, attrs = to_array(ndarr)
+    new_ndarr = from_array(arr, attrs)
+    assert (arr == np.array(ndarr)).all()
+    assert (ndarr == new_ndarr).all()
+    assert isinstance(new_ndarr, type(ndarr))
+
+    # Test dask
+    darr = da.random.random((10, 10))
+    arr, attrs = to_array(darr)
+    new_darr = from_array(arr, attrs)
+    assert (arr == np.array(darr)).all()
+    assert (darr == new_darr).all()
+    assert isinstance(new_darr, type(darr))
+
+    # Test torch layers
+    conv1d = Conv1d(4, 4, 3)
+    arr, attrs = to_array(conv1d)
+    new_conv = from_array(arr, attrs)
+    # to_array must always hand back a plain numpy array
+    assert isinstance(arr, np.ndarray)
+    assert layers_are_equal(conv1d, new_conv)
+
+    # Test torch tensors; rand avoids the uninitialized values (possibly
+    # NaN) that Tensor(4, 4, 3) returns, which would break the check below
+    tens = rand(4, 4, 3)
+    arr, attrs = to_array(tens)
+    new_tens = from_array(arr, attrs)
+    assert isinstance(arr, np.ndarray)
+    assert (tens == new_tens).all()
diff --git a/test/serial/test_utils.py b/test/serial/test_utils.py
index f9686be..a5ab8d4 100644
--- a/test/serial/test_utils.py
+++ b/test/serial/test_utils.py
@@ -7,9 +7,18 @@
 import tarfile
 
 import pytest
-from lyncs_io.utils import find_file, get_depth, find_member, format_key
+from pandas import DataFrame
+from scipy import sparse
+from lyncs_io.utils import (
+    find_file,
+    get_depth,
+    find_member,
+    format_key,
+    is_sparse_matrix,
+    from_reduced,
+)
 from lyncs_io.testing import tempdir
 from lyncs_io.base import save
 
 
 def test_find_file(tempdir):
@@ -80,3 +89,22 @@ def test_get_depth():
 
     key = "user/bar/.."
     assert get_depth(path, key) == 1
+
+
+def test_is_sparse_matrix():
+    formats = ["csr", "csc", "coo", "bsr", "dia", "dok", "lil"]
+
+    for fmt in formats:
+        matrix = sparse.random(4, 4, format=fmt)
+        assert is_sparse_matrix(matrix)
+
+
+def test_from_reduced():
+    objs = [
+        DataFrame({}),
+        # ndarrays take the native array path, not __reduce__
+        sparse.random(1, 1),
+    ]
+
+    for obj in objs:
+        assert from_reduced(obj.__reduce__())
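A natural follow-up for the test suite, sketched under the assumption that `check_support` keeps its current behavior of raising TypeError for unsupported types; this test is not part of the diff:

```python
import pytest

from lyncs_io.utils import check_support


def test_check_support():
    # A plain set is not among the supported types, so it must raise
    with pytest.raises(TypeError):
        check_support({1, 2, 3})

    # Supported inputs pass silently (check_support returns None)
    check_support(None)
```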