nmdp-bioinformatics · Ofekirsh · Sep 17, 2025
diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md
@@ -3,6 +3,6 @@ Description of Pull Request..
 ## Checklist
 
 - [ ] Unit tests
-- [ ] Documentation 
+- [ ] Documentation
 
 Fixes #
diff --git a/grma/.DS_Store b/grma/.DS_Store
diff --git a/grma/__init__.py b/grma/__init__.py
@@ -1,2 +1,2 @@
 from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
-from grma.match import matching, find_matches
+from grma.match.match import matching, find_matches
diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py
@@ -10,9 +10,8 @@
 from grma.donorsgraph.create_lol import LolBuilder
 from grma.match.graph_wrapper import Graph
 from grma.utilities.geno_representation import HashableArray
-from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
-
-CLASS_I_END = 6
+from grma.utilities.utils import print_time, geno_to_int, gl_string_to_hash
+from bidict import bidict
 
 
 class BuildMatchingGraph:
@@ -21,7 +20,7 @@ class BuildMatchingGraph:
     It gets a path to directory with the donors' file, builds the graph and saved it as LOL graph using Cython.
     """
 
-    __slots__ = "_verbose", "_graph", "_edges"
+    __slots__ = "_verbose", "_graph", "_edges", "bidirectional_dict"
 
     def __init__(self, path_to_donors_directory: str, verbose: bool = False):
         """
@@ -33,19 +32,20 @@ def __init__(self, path_to_donors_directory: str, verbose: bool = False):
         self._verbose = verbose
         self._graph = None  # LOL dict-representation
         self._edges: List[Edge] = []  # edge-list
+        self.bidirectional_dict = bidict()
         self._save_graph_as_edges(path_to_donors_directory)
 
     def _create_classes_edges(self, geno, class_, layers):
-        int_class = tuple_geno_to_int(class_)
+        hash_class = gl_string_to_hash(str(class_)) % 1000000000 + 1000000000
 
-        self._edges.append(Edge(int_class, geno, 0))
+        self._edges.append(Edge(hash_class, geno, 0))
 
         # check if the class node was created
-        if int_class not in layers["CLASS"]:
-            layers["CLASS"].add(int_class)
-            self._create_subclass_edges(class_, int_class, layers)
+        if hash_class not in layers["CLASS"]:
+            layers["CLASS"].add(hash_class)
+            self._create_subclass_edges(class_, hash_class, layers)
 
-    def _create_subclass_edges(self, class_, int_class, layers):
+    def _create_subclass_edges(self, class_, hash_class, layers):
         """
         subclasses edges are created by dropping an allele from a class.
         each allele we drop, will be replaced with zero,
@@ -58,19 +58,18 @@ def _create_subclass_edges(self, class_, int_class, layers):
         # set the missing allele to always be the second allele in the locus
         for i in range(num_of_alleles):
             if i % 2 == 0:
-                subclass_alleles.add(
-                    tuple_geno_to_int(tuple(class_[0:i] + (0,) + class_[i + 1 :]))
-                )
+                subclass = tuple(class_[0:i] + (0,) + class_[i + 1 :])
             else:
-                subclass_alleles.add(
-                    tuple_geno_to_int(
-                        tuple(class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :])
-                    )
+                subclass = tuple(
+                    class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :]
                 )
 
+            hash_subclass = gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000
+            subclass_alleles.add(hash_subclass)
+
         # add subclass->class edges
         for sub in subclass_alleles:
-            self._edges.append(Edge(sub, int_class, 0))
+            self._edges.append(Edge(sub, hash_class, 0))
             if sub not in layers["SUBCLASS"]:
                 layers["SUBCLASS"].add(sub)
 
@@ -103,16 +102,27 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                 ):
                     # retrieve all line's parameters
                     donor_id, geno, probability, index = line.strip().split(",")
-                    donor_id = int(donor_id)
+                    donor_id = -1 * int(donor_id)
                     index = int(index)
                     probability = float(probability)
 
                     # convert geno to list of integers
-                    geno = gl_string_to_integers(geno)
-
-                    # sort alleles for each HLA-X
-                    for x in range(0, 10, 2):
-                        geno[x : x + 2] = sorted(geno[x : x + 2])
+                    alleles = [
+                        allele
+                        for locus in geno.split("^")
+                        for allele in locus.split("+")
+                    ]
+                    geno = []
+                    for allele in alleles:
+                        if "N" in allele:
+                            geno.append(0)
+                        elif allele in self.bidirectional_dict:
+                            geno.append(self.bidirectional_dict[allele])
+                        else:
+                            self.bidirectional_dict[allele] = (
+                                len(self.bidirectional_dict) + 3
+                            )
+                            geno.append(self.bidirectional_dict[allele])
                     geno = HashableArray(geno)
 
                     # handle new donor appearance in file
@@ -137,6 +147,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                     # continue creation of classes and subclasses
                     if geno not in layers["GENOTYPE"]:
                         layers["GENOTYPE"].add(geno)
+                        CLASS_I_END = -2 * int(-len(geno) / 4 - 0.5)
                         geno_class1 = tuple(geno[:CLASS_I_END])
                         geno_class2 = tuple(geno[CLASS_I_END:])
                         self._create_classes_edges(geno, geno_class1, layers)
@@ -177,4 +188,5 @@ def to_pickle(self, path: Union[str, os.PathLike]):
 
         :param path: A path to save the pickled object
         """
-        pickle.dump(self._graph, open(path, "wb"))
+        graph_bdict = [self._graph, self.bidirectional_dict]
+        pickle.dump(graph_bdict, open(path, "wb"))
diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py
@@ -6,7 +6,7 @@
 from collections import OrderedDict
 
 from grma.donorsgraph import Edge
-from grma.utilities.utils import print_time, tuple_geno_to_int
+from grma.utilities.utils import print_time
 
 
 class LolBuilder:
@@ -76,16 +76,18 @@ def _convert(self, layers: Dict[str, Set]):
         arrays_start = free
         # map lol-ids to arrays
         # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
+        geno = layers["GENOTYPE"].pop()
+        layers["GENOTYPE"].add(geno)
         map_number_to_arr_node = np.zeros(
-            (len(layers["GENOTYPE"]), 10), dtype=np.uint16
+            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint32
         )
         for i, geno in tqdm(
             enumerate(layers["GENOTYPE"]),
             desc="(1.3) Map nodes to internal numbers",
             disable=not self._verbose,
         ):
             map_node_to_number[geno] = free
-            map_number_to_arr_node[i, :] = geno.np()
+            map_number_to_arr_node[i] = geno
             free += 1
 
         # map classes to lol-id.
@@ -110,7 +112,7 @@ def _convert(self, layers: Dict[str, Set]):
                 )
                 if y < subclasses_start
             ],
-            dtype=np.uint32,
+            dtype=np.int32,
         )
 
         print_time("(3/6) Create the index list")
@@ -184,12 +186,6 @@ def _convert(self, layers: Dict[str, Set]):
                     weights_list=weights_list,
                 )
 
-        # replace geno hashable array to more efficient representation.
-        for array_geno in layers["GENOTYPE"]:
-            int_geno = tuple_geno_to_int(array_geno)
-            map_node_to_number[int_geno] = map_node_to_number[array_geno]
-            del map_node_to_number[array_geno]
-
         del self._graph
         del layers
         gc.collect()