From 3b759590a1f21610f72d6f2217e4adb70e8dacef Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Tue, 17 Aug 2021 14:03:39 +0300 Subject: [PATCH 01/14] Draft solution for ndarray/DataFrame conversion --- lyncs_io/convert.py | 37 ++++++++++++++++++++++++++++++++----- test/serial/test_convert.py | 14 ++++++++++++++ 2 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 test/serial/test_convert.py diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index 1690651..b0cc27c 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -5,20 +5,45 @@ from datetime import datetime import numpy +from pandas import DataFrame from .utils import is_dask_array from . import __version__ -def get_attrs(data): +def array_to_df(data, attrs): + + # attrs = ( + # , + # ( + # , + # , + # None, + # ), + # {_mgr : BlockManager...} + # ) + + instance = attrs[1][0] + index = [x for x in attrs[2]['_mgr'].items] + df_data = dict(zip(index, data.T)) + return instance(df_data) + + +def get_attrs(data, flag=False): """ Returns the list of attributes needed for reconstructing a data object """ - return { + _dict = { "_lyncs_io": __version__, "created": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), "type": repr(type(data)), } + if flag: + _dict["type"] = type(data) + + if _dict['type'] == DataFrame: + return data.__reduce__() + def get_array_attrs(data): "Returns attributes of an array" @@ -47,9 +72,10 @@ def to_array(data): Converts a data object to array. Returns also the list of attributes needed for reconstructing it. """ - attrs = get_attrs(data) + attrs = get_attrs(data, flag=True) data = _to_array(data) - attrs.update(get_array_attrs(data)) + if attrs[1][0] != DataFrame: + attrs.update(get_array_attrs(data)) return data, attrs @@ -81,4 +107,5 @@ def from_array(data, attrs=None): Converts array to a data object. Undoes to_array. """ # TODO - return data + if attrs[1][0] == DataFrame: + return array_to_df(data, attrs) \ No newline at end of file diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py new file mode 100644 index 0000000..16d45de --- /dev/null +++ b/test/serial/test_convert.py @@ -0,0 +1,14 @@ +from lyncs_io.convert import to_array, from_array +import numpy as np +from pandas import DataFrame +from scipy import sparse + + +def test_to_from_array(): + df = DataFrame({"A": [1, 2], "B": [3, 4]}) + arr, attrs = to_array(df) + new_df = from_array(arr, attrs) + + assert type(df) == type(new_df) + + assert (df.all() == new_df.all()).all() \ No newline at end of file From 875a859c5c90df25f90aaa662f0c157108a6997f Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 17 Aug 2021 12:23:11 +0000 Subject: [PATCH 02/14] Applying black formatting (from Github Action) --- lyncs_io/convert.py | 8 ++++---- test/serial/test_convert.py | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index b0cc27c..70f7a60 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -11,7 +11,7 @@ def array_to_df(data, attrs): - + # attrs = ( # , # ( @@ -23,7 +23,7 @@ def array_to_df(data, attrs): # ) instance = attrs[1][0] - index = [x for x in attrs[2]['_mgr'].items] + index = [x for x in attrs[2]["_mgr"].items] df_data = dict(zip(index, data.T)) return instance(df_data) @@ -41,7 +41,7 @@ def get_attrs(data, flag=False): if flag: _dict["type"] = type(data) - if _dict['type'] == DataFrame: + if _dict["type"] == DataFrame: return data.__reduce__() @@ -108,4 +108,4 @@ def from_array(data, attrs=None): """ # TODO if attrs[1][0] == DataFrame: - return array_to_df(data, attrs) \ No newline at end of file + return array_to_df(data, attrs) diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index 16d45de..62e4abf 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -11,4 +11,4 @@ def test_to_from_array(): assert type(df) == type(new_df) - assert (df.all() == new_df.all()).all() \ No newline at end of file + assert (df.all() == new_df.all()).all() From 54673e212de866a5bc833d36439d4c6461556810 Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Tue, 17 Aug 2021 16:18:02 +0300 Subject: [PATCH 03/14] test commit --- lyncs_io/convert.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index b0cc27c..4727f1d 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -10,6 +10,10 @@ from . import __version__ +def array_to_sparse(data, attrs): + pass + + def array_to_df(data, attrs): # attrs = ( From ac3198f8737f3583f842c665b6d7972e8a376f76 Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Tue, 17 Aug 2021 17:39:47 +0300 Subject: [PATCH 04/14] (draft) support for ndarray/SparseMatrix conversion --- lyncs_io/convert.py | 45 +++++++++++++++++++++++++++++++------ lyncs_io/utils.py | 4 ++++ test/serial/test_convert.py | 21 ++++++++++++++++- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index e02b738..4c47c19 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -6,16 +6,23 @@ from datetime import datetime import numpy from pandas import DataFrame +from scipy import sparse from .utils import is_dask_array from . import __version__ -def array_to_sparse(data, attrs): - pass +def array_to_coo(data): + return sparse.coo_matrix(data) -def array_to_df(data, attrs): +def array_to_csc(data): + return sparse.csc_matrix(data) + +def array_to_csr(data): + return sparse.csr_matrix(data) + +def array_to_df(data, attrs): # attrs = ( # , # ( @@ -48,6 +55,8 @@ def get_attrs(data, flag=False): if _dict["type"] == DataFrame: return data.__reduce__() + return _dict + def get_array_attrs(data): "Returns attributes of an array" @@ -68,6 +77,15 @@ def _to_array(data): "Converts data to array" if is_dask_array(data): return data + + if type(data) == type(sparse.coo_matrix(0)): + return data.toarray() + + if type(data) == type(sparse.csc_matrix(0)): + return data.toarray() + + if type(data) == type(sparse.csr_matrix(0)): + return data.toarray() return numpy.array(data) @@ -78,8 +96,10 @@ def to_array(data): """ attrs = get_attrs(data, flag=True) data = _to_array(data) - if attrs[1][0] != DataFrame: - attrs.update(get_array_attrs(data)) + if type(attrs) != dict: + if attrs[1][0] != DataFrame: + attrs.update(get_array_attrs(data)) + return data, attrs @@ -111,5 +131,16 @@ def from_array(data, attrs=None): Converts array to a data object. Undoes to_array. """ # TODO - if attrs[1][0] == DataFrame: - return array_to_df(data, attrs) + if type(attrs) == dict: + if attrs['type'] == type(sparse.csc_matrix(0)): + return array_to_csc(data) + + if attrs['type'] == type(sparse.csr_matrix(0)): + return array_to_csr(data) + + if attrs['type'] == type(sparse.coo_matrix(0)): + return array_to_coo(data) + + else: + if attrs[1][0] == DataFrame: + return array_to_df(data, attrs) diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index 87b24d5..f3b9fb1 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -54,6 +54,10 @@ def is_dask_array(obj): return False +def is_sparse_matrix(obj): + pass + + def swap(fnc): "Returns a wrapper that swaps the first two arguments of the function" return wraps(fnc)( diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index 62e4abf..3710e77 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -10,5 +10,24 @@ def test_to_from_array(): new_df = from_array(arr, attrs) assert type(df) == type(new_df) - assert (df.all() == new_df.all()).all() + + csr = sparse.random(4,4,format='csr') + csc = sparse.random(4,4,format='csc') + cc = sparse.random(4,4,format='coo') + + sparse_matrices = [csr, csc, cc] + + for m in sparse_matrices: + arr, attrs = to_array(m) + + new_m = from_array(arr, attrs) + + assert type(m) == type(new_m) + assert (m!=new_m).nnz == 0 + + # "For dense arrays >>> np.allclose + # is a good way of testing equality. + # And if the sparse arrays aren't too large, that might be good as well" + + assert np.allclose(m.A, new_m.A) From 73d8f38d0b2aec64dd7bd35a65b4590478a35c7e Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 17 Aug 2021 14:40:46 +0000 Subject: [PATCH 05/14] Applying black formatting (from Github Action) --- lyncs_io/convert.py | 7 ++++--- test/serial/test_convert.py | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index 4c47c19..43cd98e 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -22,6 +22,7 @@ def array_to_csc(data): def array_to_csr(data): return sparse.csr_matrix(data) + def array_to_df(data, attrs): # attrs = ( # , @@ -132,13 +133,13 @@ def from_array(data, attrs=None): """ # TODO if type(attrs) == dict: - if attrs['type'] == type(sparse.csc_matrix(0)): + if attrs["type"] == type(sparse.csc_matrix(0)): return array_to_csc(data) - if attrs['type'] == type(sparse.csr_matrix(0)): + if attrs["type"] == type(sparse.csr_matrix(0)): return array_to_csr(data) - if attrs['type'] == type(sparse.coo_matrix(0)): + if attrs["type"] == type(sparse.coo_matrix(0)): return array_to_coo(data) else: diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index 3710e77..2d2e36f 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -12,9 +12,9 @@ def test_to_from_array(): assert type(df) == type(new_df) assert (df.all() == new_df.all()).all() - csr = sparse.random(4,4,format='csr') - csc = sparse.random(4,4,format='csc') - cc = sparse.random(4,4,format='coo') + csr = sparse.random(4, 4, format="csr") + csc = sparse.random(4, 4, format="csc") + cc = sparse.random(4, 4, format="coo") sparse_matrices = [csr, csc, cc] @@ -24,7 +24,7 @@ def test_to_from_array(): new_m = from_array(arr, attrs) assert type(m) == type(new_m) - assert (m!=new_m).nnz == 0 + assert (m != new_m).nnz == 0 # "For dense arrays >>> np.allclose # is a good way of testing equality. From 02b069d08fb1cc443fe80101102701f0e4d56b60 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Tue, 17 Aug 2021 14:40:56 +0000 Subject: [PATCH 06/14] Updating pylint score (from Github Action) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dc3b43..71e148d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![license](https://img.shields.io/github/license/Lyncs-API/lyncs.io?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/blob/master/LICENSE) [![build & test](https://img.shields.io/github/workflow/status/Lyncs-API/lyncs.io/build%20&%20test?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/actions) [![codecov](https://img.shields.io/codecov/c/github/Lyncs-API/lyncs.io?logo=codecov&logoColor=white)](https://codecov.io/gh/Lyncs-API/lyncs.io) -[![pylint](https://img.shields.io/badge/pylint%20score-9.6%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) +[![pylint](https://img.shields.io/badge/pylint%20score-9.5%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) [![black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=codefactor&logoColor=white)](https://github.com/ambv/black) Lyncs IO offers two high-level functions `load` and `save` (or `dump` as alias of `save`). From 5d758cd1143b5f21a2562ac3d4936478abe941e8 Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Tue, 17 Aug 2021 22:32:15 +0300 Subject: [PATCH 07/14] removed redundant code & added is_sparse_matrix function and test --- lyncs_io/convert.py | 42 ++++++++----------------------------- lyncs_io/utils.py | 8 ++++++- test/serial/test_convert.py | 12 +++++++++++ test/serial/test_utils.py | 27 +++++++++++++++++++++++- 4 files changed, 54 insertions(+), 35 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index 4c47c19..67ddfec 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -7,21 +7,10 @@ import numpy from pandas import DataFrame from scipy import sparse -from .utils import is_dask_array +from .utils import is_dask_array, is_sparse_matrix from . import __version__ -def array_to_coo(data): - return sparse.coo_matrix(data) - - -def array_to_csc(data): - return sparse.csc_matrix(data) - - -def array_to_csr(data): - return sparse.csr_matrix(data) - def array_to_df(data, attrs): # attrs = ( # , @@ -78,14 +67,9 @@ def _to_array(data): if is_dask_array(data): return data - if type(data) == type(sparse.coo_matrix(0)): + if is_sparse_matrix(type(data)): return data.toarray() - if type(data) == type(sparse.csc_matrix(0)): - return data.toarray() - - if type(data) == type(sparse.csr_matrix(0)): - return data.toarray() return numpy.array(data) @@ -96,9 +80,8 @@ def to_array(data): """ attrs = get_attrs(data, flag=True) data = _to_array(data) - if type(attrs) != dict: - if attrs[1][0] != DataFrame: - attrs.update(get_array_attrs(data)) + if type(attrs) != dict and attrs[1][0] != DataFrame: + attrs.update(get_array_attrs(data)) return data, attrs @@ -131,16 +114,9 @@ def from_array(data, attrs=None): Converts array to a data object. Undoes to_array. """ # TODO - if type(attrs) == dict: - if attrs['type'] == type(sparse.csc_matrix(0)): - return array_to_csc(data) - - if attrs['type'] == type(sparse.csr_matrix(0)): - return array_to_csr(data) + if type(attrs) == dict and attrs.get('type') is not None: + if is_sparse_matrix(attrs['type']): + return attrs['type'](data) - if attrs['type'] == type(sparse.coo_matrix(0)): - return array_to_coo(data) - - else: - if attrs[1][0] == DataFrame: - return array_to_df(data, attrs) + if attrs[1][0] == DataFrame: + return array_to_df(data, attrs) diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index f3b9fb1..97b70d8 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -5,7 +5,9 @@ from functools import wraps from pathlib import Path from os.path import splitext +from inspect import getmembers from collections import defaultdict +from scipy import sparse from lyncs_utils.io import FileLike @@ -55,8 +57,12 @@ def is_dask_array(obj): def is_sparse_matrix(obj): - pass + for item in [x[1] for x in getmembers(sparse)]: + if item == obj: + return True +def is_dataframe(obj): + pass def swap(fnc): "Returns a wrapper that swaps the first two arguments of the function" diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index 3710e77..cfc1ad5 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -5,6 +5,18 @@ def test_to_from_array(): + + # TODO: [x] sparse matrices + # TODO: [ ] ndarrays + # TODO: [ ] built-ins + # TODO: [ ] dask + # TODO: [ ] torch + # TODO: [x] Dataframes + + # ?? + # TODO: [ ] keras + # TODO: [ ] tensorflow + df = DataFrame({"A": [1, 2], "B": [3, 4]}) arr, attrs = to_array(df) new_df = from_array(arr, attrs) diff --git a/test/serial/test_utils.py b/test/serial/test_utils.py index f9686be..c3328c5 100644 --- a/test/serial/test_utils.py +++ b/test/serial/test_utils.py @@ -7,9 +7,18 @@ import tarfile import pytest -from lyncs_io.utils import find_file, get_depth, find_member, format_key +from lyncs_io.utils import find_file, get_depth, find_member, format_key, is_sparse_matrix from lyncs_io.testing import tempdir from lyncs_io.base import save +from scipy.sparse import ( + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix +) def test_find_file(tempdir): @@ -80,3 +89,19 @@ def test_get_depth(): key = "user/bar/.." assert get_depth(path, key) == 1 + + +def test_is_sparse_matrix(): + + l = [ + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix + ] + + for obj in l: + assert is_sparse_matrix(obj) == True From 6798d4c565d0ed38939ee0f929f94504e4245451 Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Wed, 18 Aug 2021 20:06:35 +0300 Subject: [PATCH 08/14] Added support for torch layers and tensors & Other improvements Improvements include: - Using a more universal way to reconstruct objects that support __reduce__ or __getstate__ - (Almost all) Tests also tets that the intermediate array is the correct ndarray --- lyncs_io/convert.py | 84 +++++++++++++++++++------------------ lyncs_io/utils.py | 73 +++++++++++++++++++++++++++++--- test/serial/test_convert.py | 61 ++++++++++++++++++++------- test/serial/test_utils.py | 46 ++++++++++++++------ 4 files changed, 192 insertions(+), 72 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index 9476acd..f223dcd 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -5,42 +5,31 @@ from datetime import datetime import numpy +from dask.array.core import Array as darr from pandas import DataFrame from scipy import sparse -from .utils import is_dask_array, is_sparse_matrix +from .utils import ( + is_dask_array, + is_sparse_matrix, + from_reduced, + in_torch_nn, + layer_to_tensor, + tensor_to_numpy, +) +from torch import Tensor, tensor from . import __version__ -<<<<<<< HEAD -======= -def array_to_coo(data): - return sparse.coo_matrix(data) +def reconstruct_reduced(data, attrs): + fnc, args, kwargs = attrs + obj = fnc(*args) + if hasattr(obj, "__setstate__"): + obj.__setstate__(kwargs) + else: + obj.__dict__.update(kwargs) -def array_to_csc(data): - return sparse.csc_matrix(data) - - -def array_to_csr(data): - return sparse.csr_matrix(data) - - ->>>>>>> 02b069d08fb1cc443fe80101102701f0e4d56b60 -def array_to_df(data, attrs): - # attrs = ( - # , - # ( - # , - # , - # None, - # ), - # {_mgr : BlockManager...} - # ) - - instance = attrs[1][0] - index = [x for x in attrs[2]["_mgr"].items] - df_data = dict(zip(index, data.T)) - return instance(df_data) + return obj def get_attrs(data, flag=False): @@ -53,11 +42,17 @@ def get_attrs(data, flag=False): "type": repr(type(data)), } - if flag: - _dict["type"] = type(data) + _dict["type"] = type(data) if flag else _dict["type"] + + if _dict["type"] not in (Tensor, numpy.ndarray, darr, type(None)): - if _dict["type"] == DataFrame: - return data.__reduce__() + if hasattr(data, "__reduce__"): + return data.__reduce__() + if hasattr(data, "__getstate__"): + return _dict["type"], data.__getstate__() + + # No need for __dict__: + # "If the method is absent, the instance’s __dict__ is pickled as usual" return _dict @@ -85,6 +80,12 @@ def _to_array(data): if is_sparse_matrix(type(data)): return data.toarray() + if in_torch_nn(type(data)): + return layer_to_tensor(tensor_to_numpy(data)) + + if isinstance(data, Tensor): + return tensor_to_numpy(data) + return numpy.array(data) @@ -95,7 +96,8 @@ def to_array(data): """ attrs = get_attrs(data, flag=True) data = _to_array(data) - if type(attrs) != dict and attrs[1][0] != DataFrame: + + if isinstance(attrs, dict): attrs.update(get_array_attrs(data)) return data, attrs @@ -128,10 +130,12 @@ def from_array(data, attrs=None): """ Converts array to a data object. Undoes to_array. """ - # TODO - if type(attrs) == dict and attrs.get('type') is not None: - if is_sparse_matrix(attrs['type']): - return attrs['type'](data) - if attrs[1][0] == DataFrame: - return array_to_df(data, attrs) + if from_reduced(attrs): + return reconstruct_reduced(data, attrs) + + if isinstance(attrs, dict): + if attrs["type"] == Tensor: + return tensor(data) + + return data diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index 97b70d8..cc7cd9a 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -2,13 +2,22 @@ Function utils """ +import torch.nn from functools import wraps from pathlib import Path from os.path import splitext from inspect import getmembers from collections import defaultdict -from scipy import sparse from lyncs_utils.io import FileLike +from scipy.sparse import ( + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix, +) def find_file(filename): @@ -56,13 +65,65 @@ def is_dask_array(obj): return False +""" !!!!!!!!!!! """ + + +def in_torch_nn(obj): + return obj in getmembers(torch.nn) + + +def layer_to_tensor(layer): + fnc, args, kwargs = layer.__reduce__() + params = kwargs["_parameters"] + items = list(params.items()) + param = items[0][1] + return param[:] + + +def layers_are_equal(l1, l2): + "Compare two layers. Using double equals is inappropriate" + return l1.__reduce__() == l2.__reduce__() + + +def tensor_to_numpy(tensor): + return tensor.detach().numpy() + + def is_sparse_matrix(obj): - for item in [x[1] for x in getmembers(sparse)]: - if item == obj: - return True + "Check whether an object is a sparse matrix" + return obj in ( + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix, + ) + + +def from_state(attrs): + return ( + type(attrs) == tuple + and len(attrs) == 2 + and callable(attrs[0]) + and isinstance(type(attrs[1]), dict) + ) + + +def from_reduced(attrs): + "Returns whether an object matches the tuple's format returned by __reduce__" + return ( + type(attrs) == tuple + and len(attrs) == 3 + and callable(attrs[0]) + and isinstance(attrs[1], tuple) + and isinstance(attrs[2], dict) + ) + + +""" !!!!!!!!!!! """ -def is_dataframe(obj): - pass def swap(fnc): "Returns a wrapper that swaps the first two arguments of the function" diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index a03868e..b8fcd6f 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -1,15 +1,19 @@ from lyncs_io.convert import to_array, from_array +from torch.nn import Conv1d +from torch import Tensor import numpy as np +import dask.array as da from pandas import DataFrame from scipy import sparse +from lyncs_io.utils import layers_are_equal def test_to_from_array(): # TODO: [x] sparse matrices - # TODO: [ ] ndarrays + # TODO: [x] ndarrays # TODO: [ ] built-ins - # TODO: [ ] dask + # TODO: [x] dask # TODO: [ ] torch # TODO: [x] Dataframes @@ -17,29 +21,58 @@ def test_to_from_array(): # TODO: [ ] keras # TODO: [ ] tensorflow + # Test DataFrames df = DataFrame({"A": [1, 2], "B": [3, 4]}) arr, attrs = to_array(df) new_df = from_array(arr, attrs) - assert type(df) == type(new_df) + assert (arr == np.array(df)).all() + assert isinstance(new_df, type(df)) assert (df.all() == new_df.all()).all() - csr = sparse.random(4, 4, format="csr") - csc = sparse.random(4, 4, format="csc") - cc = sparse.random(4, 4, format="coo") - - sparse_matrices = [csr, csc, cc] - - for m in sparse_matrices: - arr, attrs = to_array(m) + # Test sparse matrices + formats = ["csr", "csc", "coo", "bsr", "dia", "dok", "lil"] + for f in formats: + matrix = sparse.random(4, 4, format=f) + arr, attrs = to_array(matrix) new_m = from_array(arr, attrs) - assert type(m) == type(new_m) - assert (m != new_m).nnz == 0 + # TODO: + + assert (arr == matrix.toarray()).all() + assert isinstance(new_m, type(matrix)) + assert (matrix != new_m).nnz == 0 + assert np.allclose(matrix.A, new_m.A) # "For dense arrays >>> np.allclose # is a good way of testing equality. # And if the sparse arrays aren't too large, that might be good as well" - assert np.allclose(m.A, new_m.A) + # Test ndarrays + ndarr = np.random.rand(2, 2) + arr, attrs = to_array(ndarr) + new_ndarr = from_array(arr, attrs) + assert (arr == np.array(ndarr)).all() + assert (ndarr == new_ndarr).all() + assert isinstance(new_ndarr, type(ndarr)) + + # Test dask + darr = da.random.random((10, 10)) + arr, attrs = to_array(darr) + new_darr = from_array(arr, attrs) + assert (arr == np.array(darr)).all() + assert (darr == new_darr).all() + assert isinstance(new_darr, type(darr)) + + conv1d = Conv1d(4, 4, 3) + arr, attrs = to_array(conv1d) + new_conv = from_array(arr, attrs) + # assert numpy array + assert layers_are_equal(conv1d, new_conv) + + tensor = Tensor(4, 4, 3) + arr, attrs = to_array(tensor) + new_tens = from_array(arr, attrs) + # assert numpy array + assert (tensor == new_tens).all() diff --git a/test/serial/test_utils.py b/test/serial/test_utils.py index c3328c5..a414a53 100644 --- a/test/serial/test_utils.py +++ b/test/serial/test_utils.py @@ -7,7 +7,18 @@ import tarfile import pytest -from lyncs_io.utils import find_file, get_depth, find_member, format_key, is_sparse_matrix +import numpy as np +from pandas import DataFrame +from scipy import sparse +from lyncs_io.utils import ( + find_file, + get_depth, + find_member, + format_key, + is_sparse_matrix, + from_reduced, + from_state, +) from lyncs_io.testing import tempdir from lyncs_io.base import save from scipy.sparse import ( @@ -17,7 +28,7 @@ bsr_matrix, dia_matrix, dok_matrix, - lil_matrix + lil_matrix, ) @@ -93,15 +104,26 @@ def test_get_depth(): def test_is_sparse_matrix(): - l = [ - csc_matrix, - csr_matrix, - coo_matrix, - bsr_matrix, - dia_matrix, - dok_matrix, - lil_matrix + matrix_types = [ + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix, ] - - for obj in l: + + for obj in matrix_types: assert is_sparse_matrix(obj) == True + + +def test_from_reduced(): + objs = [ + DataFrame({}), + # np.ndarray, + sparse.random(1, 1), + ] + + for obj in objs: + assert from_reduced(obj.__reduce__()) == True From d3604ca4616d21a95a3a099a8f34804b9555c15d Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Thu, 19 Aug 2021 10:10:17 +0300 Subject: [PATCH 09/14] improved pylint score --- lyncs_io/convert.py | 9 ++++----- lyncs_io/utils.py | 21 +++++++++++++++------ test/serial/test_convert.py | 6 ++++-- 3 files changed, 23 insertions(+), 13 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index f223dcd..65bee2f 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -6,8 +6,7 @@ from datetime import datetime import numpy from dask.array.core import Array as darr -from pandas import DataFrame -from scipy import sparse +from torch import Tensor, tensor from .utils import ( is_dask_array, is_sparse_matrix, @@ -16,11 +15,11 @@ layer_to_tensor, tensor_to_numpy, ) -from torch import Tensor, tensor from . import __version__ -def reconstruct_reduced(data, attrs): +def reconstruct_reduced(attrs): + "Reconstructs an object from the tuple returned by __reduce__" fnc, args, kwargs = attrs obj = fnc(*args) @@ -132,7 +131,7 @@ def from_array(data, attrs=None): """ if from_reduced(attrs): - return reconstruct_reduced(data, attrs) + return reconstruct_reduced(attrs) if isinstance(attrs, dict): if attrs["type"] == Tensor: diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index cc7cd9a..762c917 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -2,12 +2,13 @@ Function utils """ -import torch.nn from functools import wraps from pathlib import Path from os.path import splitext from inspect import getmembers from collections import defaultdict +from warnings import warn +import torch.nn from lyncs_utils.io import FileLike from scipy.sparse import ( csc_matrix, @@ -68,24 +69,31 @@ def is_dask_array(obj): """ !!!!!!!!!!! """ +# def check_suport(obj): +# "Checks whether the object's type is supported" + + def in_torch_nn(obj): + "Checks if an object belongs in the torch.nn module (Layers)" return obj in getmembers(torch.nn) def layer_to_tensor(layer): - fnc, args, kwargs = layer.__reduce__() + "Converts a torch layer to a tensor" + _, _, kwargs = layer.__reduce__() params = kwargs["_parameters"] items = list(params.items()) param = items[0][1] return param[:] -def layers_are_equal(l1, l2): +def layers_are_equal(layer1, layer2): "Compare two layers. Using double equals is inappropriate" - return l1.__reduce__() == l2.__reduce__() + return layer1.__reduce__() == layer2.__reduce__() def tensor_to_numpy(tensor): + "Converts a tensor to a numpy array" return tensor.detach().numpy() @@ -103,8 +111,9 @@ def is_sparse_matrix(obj): def from_state(attrs): + "Check whether an object matches the tuple's format returned by __getstate__" return ( - type(attrs) == tuple + isinstance(attrs, tuple) and len(attrs) == 2 and callable(attrs[0]) and isinstance(type(attrs[1]), dict) @@ -114,7 +123,7 @@ def from_state(attrs): def from_reduced(attrs): "Returns whether an object matches the tuple's format returned by __reduce__" return ( - type(attrs) == tuple + isinstance(attrs, tuple) and len(attrs) == 3 and callable(attrs[0]) and isinstance(attrs[1], tuple) diff --git a/test/serial/test_convert.py b/test/serial/test_convert.py index b8fcd6f..eff901e 100644 --- a/test/serial/test_convert.py +++ b/test/serial/test_convert.py @@ -12,9 +12,9 @@ def test_to_from_array(): # TODO: [x] sparse matrices # TODO: [x] ndarrays - # TODO: [ ] built-ins + # TODO: [x] built-ins # TODO: [x] dask - # TODO: [ ] torch + # TODO: [x] torch # TODO: [x] Dataframes # ?? @@ -69,10 +69,12 @@ def test_to_from_array(): arr, attrs = to_array(conv1d) new_conv = from_array(arr, attrs) # assert numpy array + assert isinstance(arr, np.ndarray) assert layers_are_equal(conv1d, new_conv) tensor = Tensor(4, 4, 3) arr, attrs = to_array(tensor) new_tens = from_array(arr, attrs) # assert numpy array + assert isinstance(arr, np.ndarray) assert (tensor == new_tens).all() From 895dfcf3cb2a7621e9f1c65880726992c90f2bd9 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Thu, 19 Aug 2021 07:11:21 +0000 Subject: [PATCH 10/14] Updating pylint score (from Github Action) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 71e148d..3dc3b43 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![license](https://img.shields.io/github/license/Lyncs-API/lyncs.io?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/blob/master/LICENSE) [![build & test](https://img.shields.io/github/workflow/status/Lyncs-API/lyncs.io/build%20&%20test?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/actions) [![codecov](https://img.shields.io/codecov/c/github/Lyncs-API/lyncs.io?logo=codecov&logoColor=white)](https://codecov.io/gh/Lyncs-API/lyncs.io) -[![pylint](https://img.shields.io/badge/pylint%20score-9.5%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) +[![pylint](https://img.shields.io/badge/pylint%20score-9.6%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) [![black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=codefactor&logoColor=white)](https://github.com/ambv/black) Lyncs IO offers two high-level functions `load` and `save` (or `dump` as alias of `save`). From d091488d5fefe71309f536f617d461e02a5e02f4 Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Thu, 19 Aug 2021 11:09:31 +0300 Subject: [PATCH 11/14] added a check before to_array is executed to check if the type of the data is supported --- lyncs_io/convert.py | 9 ++++++--- lyncs_io/utils.py | 36 +++++++++++++++++++++++++----------- test/serial/test_utils.py | 16 ++++------------ 3 files changed, 35 insertions(+), 26 deletions(-) diff --git a/lyncs_io/convert.py b/lyncs_io/convert.py index 65bee2f..5703bfc 100644 --- a/lyncs_io/convert.py +++ b/lyncs_io/convert.py @@ -14,6 +14,7 @@ in_torch_nn, layer_to_tensor, tensor_to_numpy, + check_support, ) from . import __version__ @@ -76,11 +77,11 @@ def _to_array(data): if is_dask_array(data): return data - if is_sparse_matrix(type(data)): + if is_sparse_matrix(data): return data.toarray() - if in_torch_nn(type(data)): - return layer_to_tensor(tensor_to_numpy(data)) + if in_torch_nn(data): + return tensor_to_numpy(layer_to_tensor(data)) if isinstance(data, Tensor): return tensor_to_numpy(data) @@ -93,6 +94,8 @@ def to_array(data): Converts a data object to array. Returns also the list of attributes needed for reconstructing it. """ + check_support(data) + attrs = get_attrs(data, flag=True) data = _to_array(data) diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index 762c917..d72610a 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -9,6 +9,9 @@ from collections import defaultdict from warnings import warn import torch.nn +from pandas import DataFrame +from numpy import ndarray +from torch import Tensor from lyncs_utils.io import FileLike from scipy.sparse import ( csc_matrix, @@ -69,13 +72,21 @@ def is_dask_array(obj): """ !!!!!!!!!!! """ -# def check_suport(obj): -# "Checks whether the object's type is supported" +def check_support(obj): + "Checks whether the object's type is supported" + if not ( + is_sparse_matrix(obj) + or is_dask_array(obj) + or in_torch_nn(obj) + or isinstance(obj, (ndarray, DataFrame, Tensor, type(None))) + ): + raise TypeError(f"{obj} {type(obj)} is not supported yet") def in_torch_nn(obj): "Checks if an object belongs in the torch.nn module (Layers)" - return obj in getmembers(torch.nn) + members = tuple([m[1] for m in getmembers(torch.nn) if isinstance(m[1], type)]) + return isinstance(obj, members) def layer_to_tensor(layer): @@ -99,14 +110,17 @@ def tensor_to_numpy(tensor): def is_sparse_matrix(obj): "Check whether an object is a sparse matrix" - return obj in ( - csc_matrix, - csr_matrix, - coo_matrix, - bsr_matrix, - dia_matrix, - dok_matrix, - lil_matrix, + return isinstance( + obj, + ( + csc_matrix, + csr_matrix, + coo_matrix, + bsr_matrix, + dia_matrix, + dok_matrix, + lil_matrix, + ), ) diff --git a/test/serial/test_utils.py b/test/serial/test_utils.py index a414a53..a5ab8d4 100644 --- a/test/serial/test_utils.py +++ b/test/serial/test_utils.py @@ -103,19 +103,11 @@ def test_get_depth(): def test_is_sparse_matrix(): + formats = ["csr", "csc", "coo", "bsr", "dia", "dok", "lil"] - matrix_types = [ - csc_matrix, - csr_matrix, - coo_matrix, - bsr_matrix, - dia_matrix, - dok_matrix, - lil_matrix, - ] - - for obj in matrix_types: - assert is_sparse_matrix(obj) == True + for f in formats: + matrix = sparse.random(4, 4, format=f) + assert is_sparse_matrix(matrix) == True def test_from_reduced(): From acbb3bad97ed3a1c5bda6ab40fa0f53bc7322c4a Mon Sep 17 00:00:00 2001 From: Alexandros Angeli Date: Thu, 2 Sep 2021 12:44:38 +0300 Subject: [PATCH 12/14] draft version of universally extracting arrays from deserialised obj --- lyncs_io/traverse_reduced.py | 67 ++++++++++++++++++++++++++++++++++++ lyncs_io/utils.py | 2 +- 2 files changed, 68 insertions(+), 1 deletion(-) create mode 100644 lyncs_io/traverse_reduced.py diff --git a/lyncs_io/traverse_reduced.py b/lyncs_io/traverse_reduced.py new file mode 100644 index 0000000..7cbf0c4 --- /dev/null +++ b/lyncs_io/traverse_reduced.py @@ -0,0 +1,67 @@ +import numpy +import torch +from collections import OrderedDict +from typing import Iterable +from torch.nn import Conv1d +from pprint import pprint + + +class Dummy(list): + pass + + +def from_dummy(ds): + for i, elt in enumerate(ds): + if isinstance(elt, Dummy): + ds[i] = from_dummy(elt) + if isinstance(ds, Dummy): + ds = tuple(ds) + return ds + +def to_dummy(ds): + if isinstance(ds, tuple): + ds = Dummy(ds) + for i, elt in enumerate(ds): + if isinstance(elt, tuple): + ds[i] = to_dummy(elt) + return ds + + +def gen(): + num = 0 + while True: + yield num + num += 1 + + +gen = gen() +global_dict = {} + + +def fnc(s, reverse=False, gen=gen): + global global_dict + + if isinstance(s, (dict, OrderedDict)): + x = {key : fnc(value, reverse) for key, value in s.items()} + return OrderedDict(x) if isinstance(s, OrderedDict) else x + elif isinstance(s, (list, tuple)): + x = [fnc(e, reverse) for e in s] + return tuple(x) if isinstance(s, tuple) else x + elif isinstance(s, torch.nn.Parameter): + placeholder_no = str(next(gen)) + global_dict['placeholder' + placeholder_no] = s + return 'placeholder' + str(placeholder_no) + elif isinstance(s, str) and reverse and s in global_dict.keys(): + return global_dict[s] + return s + + +c = Conv1d(4,4,3) +reduced = c.__reduce__() + +result= fnc(reduced) +after = fnc(result, reverse=True) + +test = reduced == after and reduced != result +print(test) + diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index d72610a..26e33ad 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -35,7 +35,7 @@ def find_file(filename): if isinstance(filename, FileLike): return filename - + path = Path(filename) if path.exists(): return filename From 360477f0069a737e1a73bc1d75a4949dddd493d0 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Thu, 2 Sep 2021 09:45:34 +0000 Subject: [PATCH 13/14] Applying black formatting (from Github Action) --- lyncs_io/traverse_reduced.py | 14 +++++++------- lyncs_io/utils.py | 2 +- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/lyncs_io/traverse_reduced.py b/lyncs_io/traverse_reduced.py index 7cbf0c4..d73339f 100644 --- a/lyncs_io/traverse_reduced.py +++ b/lyncs_io/traverse_reduced.py @@ -18,6 +18,7 @@ def from_dummy(ds): ds = tuple(ds) return ds + def to_dummy(ds): if isinstance(ds, tuple): ds = Dummy(ds) @@ -40,28 +41,27 @@ def gen(): def fnc(s, reverse=False, gen=gen): global global_dict - + if isinstance(s, (dict, OrderedDict)): - x = {key : fnc(value, reverse) for key, value in s.items()} + x = {key: fnc(value, reverse) for key, value in s.items()} return OrderedDict(x) if isinstance(s, OrderedDict) else x elif isinstance(s, (list, tuple)): x = [fnc(e, reverse) for e in s] return tuple(x) if isinstance(s, tuple) else x elif isinstance(s, torch.nn.Parameter): placeholder_no = str(next(gen)) - global_dict['placeholder' + placeholder_no] = s - return 'placeholder' + str(placeholder_no) + global_dict["placeholder" + placeholder_no] = s + return "placeholder" + str(placeholder_no) elif isinstance(s, str) and reverse and s in global_dict.keys(): return global_dict[s] return s -c = Conv1d(4,4,3) +c = Conv1d(4, 4, 3) reduced = c.__reduce__() -result= fnc(reduced) +result = fnc(reduced) after = fnc(result, reverse=True) test = reduced == after and reduced != result print(test) - diff --git a/lyncs_io/utils.py b/lyncs_io/utils.py index 26e33ad..d72610a 100644 --- a/lyncs_io/utils.py +++ b/lyncs_io/utils.py @@ -35,7 +35,7 @@ def find_file(filename): if isinstance(filename, FileLike): return filename - + path = Path(filename) if path.exists(): return filename From 009e04ad557190c68d8de1aa4390ee526635c5d0 Mon Sep 17 00:00:00 2001 From: Simone Bacchio Date: Thu, 2 Sep 2021 09:45:41 +0000 Subject: [PATCH 14/14] Updating pylint score (from Github Action) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 3dc3b43..71e148d 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,7 @@ [![license](https://img.shields.io/github/license/Lyncs-API/lyncs.io?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/blob/master/LICENSE) [![build & test](https://img.shields.io/github/workflow/status/Lyncs-API/lyncs.io/build%20&%20test?logo=github&logoColor=white)](https://github.com/Lyncs-API/lyncs.io/actions) [![codecov](https://img.shields.io/codecov/c/github/Lyncs-API/lyncs.io?logo=codecov&logoColor=white)](https://codecov.io/gh/Lyncs-API/lyncs.io) -[![pylint](https://img.shields.io/badge/pylint%20score-9.6%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) +[![pylint](https://img.shields.io/badge/pylint%20score-9.5%2F10-green?logo=python&logoColor=white)](http://pylint.pycqa.org/) [![black](https://img.shields.io/badge/code%20style-black-000000.svg?logo=codefactor&logoColor=white)](https://github.com/ambv/black) Lyncs IO offers two high-level functions `load` and `save` (or `dump` as alias of `save`).