Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/PULL_REQUEST_TEMPLATE/pull_request_template.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ Description of Pull Request..
## Checklist

- [ ] Unit tests
- [ ] Documentation
- [ ] Documentation

Fixes #
Binary file added grma/.DS_Store
Binary file not shown.
2 changes: 1 addition & 1 deletion grma/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
from grma.match import matching, find_matches
from grma.match.match import matching, find_matches
62 changes: 37 additions & 25 deletions grma/donorsgraph/build_donors_graph.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
from grma.donorsgraph.create_lol import LolBuilder
from grma.match.graph_wrapper import Graph
from grma.utilities.geno_representation import HashableArray
from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time

CLASS_I_END = 6
from grma.utilities.utils import print_time, geno_to_int, gl_string_to_hash
from bidict import bidict


class BuildMatchingGraph:
Expand All @@ -21,7 +20,7 @@ class BuildMatchingGraph:
It gets a path to directory with the donors' file, builds the graph and saved it as LOL graph using Cython.
"""

__slots__ = "_verbose", "_graph", "_edges"
__slots__ = "_verbose", "_graph", "_edges", "bidirectional_dict"

def __init__(self, path_to_donors_directory: str, verbose: bool = False):
"""
Expand All @@ -33,19 +32,20 @@ def __init__(self, path_to_donors_directory: str, verbose: bool = False):
self._verbose = verbose
self._graph = None # LOL dict-representation
self._edges: List[Edge] = [] # edge-list
self.bidirectional_dict = bidict()
self._save_graph_as_edges(path_to_donors_directory)

def _create_classes_edges(self, geno, class_, layers):
int_class = tuple_geno_to_int(class_)
hash_class = gl_string_to_hash(str(class_)) % 1000000000 + 1000000000

self._edges.append(Edge(int_class, geno, 0))
self._edges.append(Edge(hash_class, geno, 0))

# check if the class node was created
if int_class not in layers["CLASS"]:
layers["CLASS"].add(int_class)
self._create_subclass_edges(class_, int_class, layers)
if hash_class not in layers["CLASS"]:
layers["CLASS"].add(hash_class)
self._create_subclass_edges(class_, hash_class, layers)

def _create_subclass_edges(self, class_, int_class, layers):
def _create_subclass_edges(self, class_, hash_class, layers):
"""
subclasses edges are created by dropping an allele from a class.
each allele we drop, will be replaced with zero,
Expand All @@ -58,19 +58,18 @@ def _create_subclass_edges(self, class_, int_class, layers):
# set the missing allele to always be the second allele in the locus
for i in range(num_of_alleles):
if i % 2 == 0:
subclass_alleles.add(
tuple_geno_to_int(tuple(class_[0:i] + (0,) + class_[i + 1 :]))
)
subclass = tuple(class_[0:i] + (0,) + class_[i + 1 :])
else:
subclass_alleles.add(
tuple_geno_to_int(
tuple(class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :])
)
subclass = tuple(
class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :]
)

hash_subclass = gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000
subclass_alleles.add(hash_subclass)

# add subclass->class edges
for sub in subclass_alleles:
self._edges.append(Edge(sub, int_class, 0))
self._edges.append(Edge(sub, hash_class, 0))
if sub not in layers["SUBCLASS"]:
layers["SUBCLASS"].add(sub)

Expand Down Expand Up @@ -103,16 +102,27 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
):
# retrieve all line's parameters
donor_id, geno, probability, index = line.strip().split(",")
donor_id = int(donor_id)
donor_id = -1 * int(donor_id)
index = int(index)
probability = float(probability)

# convert geno to list of integers
geno = gl_string_to_integers(geno)

# sort alleles for each HLA-X
for x in range(0, 10, 2):
geno[x : x + 2] = sorted(geno[x : x + 2])
alleles = [
allele
for locus in geno.split("^")
for allele in locus.split("+")
]
geno = []
for allele in alleles:
if "N" in allele:
geno.append(0)
elif allele in self.bidirectional_dict:
geno.append(self.bidirectional_dict[allele])
else:
self.bidirectional_dict[allele] = (
len(self.bidirectional_dict) + 3
)
geno.append(self.bidirectional_dict[allele])
geno = HashableArray(geno)

# handle new donor appearance in file
Expand All @@ -137,6 +147,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
# continue creation of classes and subclasses
if geno not in layers["GENOTYPE"]:
layers["GENOTYPE"].add(geno)
CLASS_I_END = -2 * int(-len(geno) / 4 - 0.5)
geno_class1 = tuple(geno[:CLASS_I_END])
geno_class2 = tuple(geno[CLASS_I_END:])
self._create_classes_edges(geno, geno_class1, layers)
Expand Down Expand Up @@ -177,4 +188,5 @@ def to_pickle(self, path: Union[str, os.PathLike]):

:param path: A path to save the pickled object
"""
pickle.dump(self._graph, open(path, "wb"))
graph_bdict = [self._graph, self.bidirectional_dict]
pickle.dump(graph_bdict, open(path, "wb"))
16 changes: 6 additions & 10 deletions grma/donorsgraph/create_lol.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collections import OrderedDict

from grma.donorsgraph import Edge
from grma.utilities.utils import print_time, tuple_geno_to_int
from grma.utilities.utils import print_time


class LolBuilder:
Expand Down Expand Up @@ -76,16 +76,18 @@ def _convert(self, layers: Dict[str, Set]):
arrays_start = free
# map lol-ids to arrays
# given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
geno = layers["GENOTYPE"].pop()
layers["GENOTYPE"].add(geno)
map_number_to_arr_node = np.zeros(
(len(layers["GENOTYPE"]), 10), dtype=np.uint16
(len(layers["GENOTYPE"]), len(geno)), dtype=np.uint32
)
for i, geno in tqdm(
enumerate(layers["GENOTYPE"]),
desc="(1.3) Map nodes to internal numbers",
disable=not self._verbose,
):
map_node_to_number[geno] = free
map_number_to_arr_node[i, :] = geno.np()
map_number_to_arr_node[i] = geno
free += 1

# map classes to lol-id.
Expand All @@ -110,7 +112,7 @@ def _convert(self, layers: Dict[str, Set]):
)
if y < subclasses_start
],
dtype=np.uint32,
dtype=np.int32,
)

print_time("(3/6) Create the index list")
Expand Down Expand Up @@ -184,12 +186,6 @@ def _convert(self, layers: Dict[str, Set]):
weights_list=weights_list,
)

# replace geno hashable array to more efficient representation.
for array_geno in layers["GENOTYPE"]:
int_geno = tuple_geno_to_int(array_geno)
map_node_to_number[int_geno] = map_node_to_number[array_geno]
del map_node_to_number[array_geno]

del self._graph
del layers
gc.collect()
Expand Down
Loading
You are viewing a condensed version of this merge commit. You can view the full changes here.