diff --git a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md index 8b10e29..ead050f 100644 --- a/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md +++ b/.github/PULL_REQUEST_TEMPLATE/pull_request_template.md @@ -3,6 +3,6 @@ Description of Pull Request.. ## Checklist - [ ] Unit tests -- [ ] Documentation +- [ ] Documentation Fixes # diff --git a/grma/.DS_Store b/grma/.DS_Store new file mode 100644 index 0000000..ad67c4a Binary files /dev/null and b/grma/.DS_Store differ diff --git a/grma/__init__.py b/grma/__init__.py index d8319e8..2f3da52 100644 --- a/grma/__init__.py +++ b/grma/__init__.py @@ -1,2 +1,2 @@ from grma.donorsgraph.build_donors_graph import BuildMatchingGraph -from grma.match import matching, find_matches +from grma.match.match import matching, find_matches diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py index 303867f..d7f908f 100644 --- a/grma/donorsgraph/build_donors_graph.py +++ b/grma/donorsgraph/build_donors_graph.py @@ -10,9 +10,8 @@ from grma.donorsgraph.create_lol import LolBuilder from grma.match.graph_wrapper import Graph from grma.utilities.geno_representation import HashableArray -from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time - -CLASS_I_END = 6 +from grma.utilities.utils import print_time, geno_to_int, gl_string_to_hash +from bidict import bidict class BuildMatchingGraph: @@ -21,7 +20,7 @@ class BuildMatchingGraph: It gets a path to directory with the donors' file, builds the graph and saved it as LOL graph using Cython. """ - __slots__ = "_verbose", "_graph", "_edges" + __slots__ = "_verbose", "_graph", "_edges", "bidirectional_dict" def __init__(self, path_to_donors_directory: str, verbose: bool = False): """ @@ -33,19 +32,20 @@ def __init__(self, path_to_donors_directory: str, verbose: bool = False): self._verbose = verbose self._graph = None # LOL dict-representation self._edges: List[Edge] = [] # edge-list + self.bidirectional_dict = bidict() self._save_graph_as_edges(path_to_donors_directory) def _create_classes_edges(self, geno, class_, layers): - int_class = tuple_geno_to_int(class_) + hash_class = gl_string_to_hash(str(class_)) % 1000000000 + 1000000000 - self._edges.append(Edge(int_class, geno, 0)) + self._edges.append(Edge(hash_class, geno, 0)) # check if the class node was created - if int_class not in layers["CLASS"]: - layers["CLASS"].add(int_class) - self._create_subclass_edges(class_, int_class, layers) + if hash_class not in layers["CLASS"]: + layers["CLASS"].add(hash_class) + self._create_subclass_edges(class_, hash_class, layers) - def _create_subclass_edges(self, class_, int_class, layers): + def _create_subclass_edges(self, class_, hash_class, layers): """ subclasses edges are created by dropping an allele from a class. 
each allele we drop, will be replaced with zero, @@ -58,19 +58,18 @@ def _create_subclass_edges(self, class_, int_class, layers): # set the missing allele to always be the second allele in the locus for i in range(num_of_alleles): if i % 2 == 0: - subclass_alleles.add( - tuple_geno_to_int(tuple(class_[0:i] + (0,) + class_[i + 1 :])) - ) + subclass = tuple(class_[0:i] + (0,) + class_[i + 1 :]) else: - subclass_alleles.add( - tuple_geno_to_int( - tuple(class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :]) - ) + subclass = tuple( + class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :] ) + hash_subclass = gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000 + subclass_alleles.add(hash_subclass) + # add subclass->class edges for sub in subclass_alleles: - self._edges.append(Edge(sub, int_class, 0)) + self._edges.append(Edge(sub, hash_class, 0)) if sub not in layers["SUBCLASS"]: layers["SUBCLASS"].add(sub) @@ -103,16 +102,27 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike): ): # retrieve all line's parameters donor_id, geno, probability, index = line.strip().split(",") - donor_id = int(donor_id) + donor_id = -1 * int(donor_id) index = int(index) probability = float(probability) # convert geno to list of integers - geno = gl_string_to_integers(geno) - - # sort alleles for each HLA-X - for x in range(0, 10, 2): - geno[x : x + 2] = sorted(geno[x : x + 2]) + alleles = [ + allele + for locus in geno.split("^") + for allele in locus.split("+") + ] + geno = [] + for allele in alleles: + if "N" in allele: + geno.append(0) + elif allele in self.bidirectional_dict: + geno.append(self.bidirectional_dict[allele]) + else: + self.bidirectional_dict[allele] = ( + len(self.bidirectional_dict) + 3 + ) + geno.append(self.bidirectional_dict[allele]) geno = HashableArray(geno) # handle new donor appearance in file @@ -137,6 +147,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike): # continue creation of classes and subclasses if geno not in layers["GENOTYPE"]: layers["GENOTYPE"].add(geno) + CLASS_I_END = -2 * int(-len(geno) / 4 - 0.5) geno_class1 = tuple(geno[:CLASS_I_END]) geno_class2 = tuple(geno[CLASS_I_END:]) self._create_classes_edges(geno, geno_class1, layers) @@ -177,4 +188,5 @@ def to_pickle(self, path: Union[str, os.PathLike]): :param path: A path to save the pickled object """ - pickle.dump(self._graph, open(path, "wb")) + graph_bdict = [self._graph, self.bidirectional_dict] + pickle.dump(graph_bdict, open(path, "wb")) diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py index ce861c0..0789a4d 100644 --- a/grma/donorsgraph/create_lol.py +++ b/grma/donorsgraph/create_lol.py @@ -6,7 +6,7 @@ from collections import OrderedDict from grma.donorsgraph import Edge -from grma.utilities.utils import print_time, tuple_geno_to_int +from grma.utilities.utils import print_time class LolBuilder: @@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]): arrays_start = free # map lol-ids to arrays # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :] + geno = layers["GENOTYPE"].pop() + layers["GENOTYPE"].add(geno) map_number_to_arr_node = np.zeros( - (len(layers["GENOTYPE"]), 10), dtype=np.uint16 + (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint32 ) for i, geno in tqdm( enumerate(layers["GENOTYPE"]), @@ -85,7 +87,7 @@ def _convert(self, layers: Dict[str, Set]): disable=not self._verbose, ): map_node_to_number[geno] = free - map_number_to_arr_node[i, :] = geno.np() + 
map_number_to_arr_node[i] = geno free += 1 # map classes to lol-id. @@ -110,7 +112,7 @@ def _convert(self, layers: Dict[str, Set]): ) if y < subclasses_start ], - dtype=np.uint32, + dtype=np.int32, ) print_time("(3/6) Create the index list") @@ -184,12 +186,6 @@ def _convert(self, layers: Dict[str, Set]): weights_list=weights_list, ) - # replace geno hashable array to more efficient representation. - for array_geno in layers["GENOTYPE"]: - int_geno = tuple_geno_to_int(array_geno) - map_node_to_number[int_geno] = map_node_to_number[array_geno] - del map_node_to_number[array_geno] - del self._graph del layers gc.collect() diff --git a/grma/match/donors_matching.py b/grma/match/donors_matching.py index bbeb06e..c2ce392 100644 --- a/grma/match/donors_matching.py +++ b/grma/match/donors_matching.py @@ -2,6 +2,9 @@ from typing import List, Tuple, Set, Iterable, Dict from typing import Sequence +from ..utilities.utils import geno_to_int, gl_string_to_hash +import ast +from bidict import bidict import networkx as nx import numpy as np import pandas as pd @@ -13,15 +16,11 @@ donor_mismatch_format, drop_less_than_7_matches, check_similarity, - gl_string_to_integers, - tuple_geno_to_int, print_time, ) DONORS_DB: pd.DataFrame = pd.DataFrame() ZEROS: HashableArray = HashableArray([0]) -ALLELES_IN_CLASS_I: int = 6 -ALLELES_IN_CLASS_II: int = 4 def set_database(donors_db: pd.DataFrame = pd.DataFrame()): @@ -39,24 +38,13 @@ def _init_results_df(donors_info): fields_in_results = { "Patient_ID": [], "Donor_ID": [], + "GvH_Mismatches": [], + "HvG_Mismatches": [], "Number_Of_Mismatches": [], "Matching_Probability": [], - "Match_Probability_A_1": [], - "Match_Probability_A_2": [], - "Match_Probability_B_1": [], - "Match_Probability_B_2": [], - "Match_Probability_C_1": [], - "Match_Probability_C_2": [], - "Match_Probability_DQB1_1": [], - "Match_Probability_DQB1_2": [], - "Match_Probability_DRB1_1": [], - "Match_Probability_DRB1_2": [], + "Match_Probability": [], "Permissive/Non-Permissive": [], - "Match_Between_Most_Commons_A": [], - "Match_Between_Most_Commons_B": [], - "Match_Between_Most_Commons_C": [], - "Match_Between_Most_Commons_DQB": [], - "Match_Between_Most_Commons_DRB": [], + "Match_Between_Most_Commons": [], } donors_db_fields = DONORS_DB.columns.values.tolist() @@ -66,17 +54,28 @@ def _init_results_df(donors_info): return pd.DataFrame(fields_in_results) -def locuses_match_between_genos(geno1, geno2): +def locuses_match_between_genos(geno_pat, geno_don): matches = [] - for i in range(5): - a1, b1 = geno1[2 * i], geno1[2 * i + 1] - a2, b2 = geno2[2 * i], geno2[2 * i + 1] + total_gvh = 0 + total_hvg = 0 - s1 = int(a1 == a2) + int(b1 == b2) - s2 = int(a1 == b2) + int(b1 == a2) - matches.append(max(s1, s2)) + for i in range(0, len(geno_pat), 2): + a1, b1 = geno_pat[i], geno_pat[i + 1] + a2, b2 = geno_don[i], geno_don[i + 1] - return matches + P = {x for x in (a1, b1) if x not in (None, 0)} + D = {x for x in (a2, b2) if x not in (None, 0)} + + gvh_i = len(P - D) # patient has, donor lacks + hvg_i = len(D - P) # donor has, patient lacks + + total_gvh += gvh_i + total_hvg += hvg_i + + mismatch_i = max(gvh_i, hvg_i) # table's #Max + matches.append(mismatch_i) + + return matches, total_gvh, total_hvg class DonorsMatching(object): @@ -88,16 +87,18 @@ class DonorsMatching(object): "_genotype_candidates", "patients", "verbose", + "bidirectional_dict", ) - def __init__(self, graph: Graph, verbose: bool = False): + def __init__(self, graph: Graph, verbose: bool = False, bdict: bidict = None): self._graph: Graph 
= graph self._patients_graph: nx.DiGraph = nx.DiGraph() self._genotype_candidates: Dict[ int, Dict[int, List[Tuple[float, int]]] ] = {} # AMIT ADD - self.patients: Dict[int, Sequence[int]] = {} + self.patients: Dict[int, HashableArray] = {} self.verbose = verbose + self.bidirectional_dict = bdict def get_most_common_genotype(self, donor_id): """Takes a donor ID and return his/her most common genotype.""" @@ -127,7 +128,7 @@ def probability_to_allele( ) -> List[float]: """Takes a donor ID and a genotype. Returns the probability of match for each allele""" - probs = [0 for _ in range(10)] + probs = [0 for _ in range(len(pat_geno))] for i, allele in enumerate(pat_geno): p = 0 @@ -148,7 +149,9 @@ def __find_genotype_candidates_from_class( ) -> Tuple[np.ndarray, np.ndarray]: """Takes an integer subclass. Returns the genotypes (ids and values) which are connected to it in the graph""" - return self._graph.class_neighbors(clss) + return self._graph.class_neighbors( + clss, Len=len(next(iter(self.patients.values()))) + ) def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]: """Gets the LOL ID of a genotype. @@ -214,14 +217,23 @@ def __add_matched_genos_to_graph( weight={geno_num: [probability, similarity]}) """ - def __classes_and_subclasses_from_genotype(self, genotype: HashableArray): + def __classes_and_subclasses_from_genotype(self, genotype): subclasses = [] - classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]] + ALLELES_IN_CLASS_I = -2 * int(-len(genotype) / 4 - 0.5) + ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I + classes = [ + (tuple(genotype[:ALLELES_IN_CLASS_I]), 0), + (tuple(genotype[ALLELES_IN_CLASS_I:]), 1), + ] num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II] - int_classes = [tuple_geno_to_int(tuple(clss)) for clss in classes] + int_classes = [] + for clss in classes: + hash_clss = gl_string_to_hash(str(clss[0])) % 1000000000 + 1000000000 + int_classes.append((hash_clss, clss[1])) + for clss in int_classes: - self._patients_graph.add_edge(clss, genotype) + self._patients_graph.add_edge(clss[0], genotype) # class one is considered as 0. # class two is considered as 1. @@ -230,22 +242,29 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray): for k in range(0, num_of_alleles_in_class[class_num]): # set the missing allele to always be the second allele in the locus if k % 2 == 0: - sub = tuple_geno_to_int( - classes[class_num][0:k] + ZEROS + classes[class_num][k + 1 :] + subclass = tuple( + classes[class_num][0][0:k] + + (0,) + + classes[class_num][0][k + 1 :] ) else: - sub = tuple_geno_to_int( - classes[class_num][0 : k - 1] - + ZEROS - + classes[class_num][k - 1 : k] - + classes[class_num][k + 1 :] + subclass = tuple( + classes[class_num][0][0 : k - 1] + + (0,) + + classes[class_num][0][k - 1 : k] + + classes[class_num][0][k + 1 :] ) + hash_subclass = ( + gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000 + ) # missing allele number is the index of the first allele of the locus the missing allele belongs to. 
# Could be [0, 2, 4, 6, 8] missing_allele_num = ALLELES_IN_CLASS_I * class_num + 2 * (k // 2) subclass = ClassMinusOne( - subclass=sub, class_num=class_num, allele_num=missing_allele_num + subclass=hash_subclass, + class_num=class_num, + allele_num=missing_allele_num, ) # add subclass -> genotype edge to patients graph @@ -273,13 +292,49 @@ def create_patients_graph(self, f_patients: str): # retrieve all line's parameters line_values = line.strip().split(",") patient_id, geno, prob, index = line_values - - geno = gl_string_to_integers(geno) - patient_id = int(patient_id) + patient_id = -1 * int(patient_id) index = int(index) prob = float(prob) - # handle new patient appearance in file + encoded = [] + for locus in geno.split("^"): + parts = locus.split("+") + aL = parts[0].strip() if len(parts) > 0 else "N" + aR = parts[1].strip() if len(parts) > 1 else "N" + + # Classify left allele + if "N" in aL: + tL, vL = "missing", 0 + elif aL in self.bidirectional_dict: + tL, vL = "known", self.bidirectional_dict[aL] + else: + tL, vL = "unknown", None + + # Classify right allele + if "N" in aR: + tR, vR = "missing", 0 + elif aR in self.bidirectional_dict: + tR, vR = "known", self.bidirectional_dict[aR] + else: + tR, vR = "unknown", None + + # Decide per-locus encoding (two outputs per locus, in order) + if tL == "unknown" and tR == "unknown": + if aL == aR: + encoded.extend( + [1, 1] + ) # same unknown twice (homozygous unknown) + else: + encoded.extend([1, 2]) # two different unknowns + elif tL == "unknown" and tR != "unknown": + encoded.extend([1, vR]) # left unknown, right known/missing + elif tL != "unknown" and tR == "unknown": + encoded.extend([vL, 1]) # left known/missing, right unknown + else: + encoded.extend([vL, vR]) # both known/missing + + geno = HashableArray(encoded) + if index == 0: # set normalized probabilities for HLA, probability in prob_dict.items(): @@ -299,12 +354,6 @@ def create_patients_graph(self, f_patients: str): subclasses_by_patient[patient_id] = set() classes_by_patient[patient_id] = set() - # sort alleles for each HLA-X - for x in range(0, 10, 2): - geno[x : x + 2] = sorted(geno[x : x + 2]) - - geno = HashableArray(geno) - # add probabilities to probability dict total_prob += prob if geno not in prob_dict: @@ -351,14 +400,21 @@ def find_geno_candidates_by_subclasses(self, subclasses): genotypes_value, ) = self.__find_genotype_candidates_from_subclass(subclass.subclass) + len_geno = len(genotypes_value[0]) + ALLELES_IN_CLASS_I = -2 * int(-len_geno / 4 - 0.5) + ALLELES_IN_CLASS_II = len_geno - ALLELES_IN_CLASS_I # Checks only the locuses that are not certain to match if subclass.class_num == 0: allele_range_to_check = np.array( - [6, 8, subclass.allele_num], dtype=np.uint8 + [x for x in range(ALLELES_IN_CLASS_I, len_geno, 2)] + + [subclass.allele_num], + dtype=np.uint8, ) else: allele_range_to_check = np.array( - [0, 2, 4, subclass.allele_num], dtype=np.uint8 + [x for x in range(0, ALLELES_IN_CLASS_I, 2)] + + [subclass.allele_num], + dtype=np.uint8, ) # number of alleles that already match due to match in subclass @@ -383,9 +439,9 @@ def find_geno_candidates_by_classes(self, classes): desc="finding classes matching candidates", disable=not self.verbose, ): - if self._graph.in_nodes(clss): + if self._graph.in_nodes(clss[0]): patient_genos = self._patients_graph.neighbors( - clss + clss[0] ) # The patient's genotypes which might be match ( genotypes_ids, @@ -395,12 +451,32 @@ def find_geno_candidates_by_classes(self, classes): # Checks only the locuses that are not 
certain to match (the locuses of the other class) # Class I appearances: 3 locuses = 6 alleles = 23/24 digits # Class II appearances: 2 locuses = 4 alleles = 15/16 digits - if len(str(clss)) > 20: - allele_range_to_check = np.array([6, 8], dtype=np.uint8) - matched_alleles: int = 6 + num_alleles = len(next(iter(self.patients.values()))) + if clss[1] == 0: + allele_range_to_check = np.array( + [ + x + for x in range( + num_alleles // 2 + (num_alleles // 2 & 1), + num_alleles, + 2, + ) + ], + dtype=np.uint8, + ) + matched_alleles: int = num_alleles // 2 + (num_alleles // 2 & 1) + else: - allele_range_to_check = np.array([0, 2, 4], dtype=np.uint8) - matched_alleles: int = 4 + allele_range_to_check = np.array( + [ + x + for x in range( + 0, num_alleles // 2 + (num_alleles // 2 & 1), 2 + ) + ], + dtype=np.uint8, + ) + matched_alleles: int = num_alleles // 2 - (num_alleles // 2 & 1) # Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph. self.__add_matched_genos_to_graph( @@ -432,8 +508,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int): "probability" ] # patient's geno probability - int_geno = tuple_geno_to_int(geno) - geno_id = self._graph.get_node_id(int_geno) + geno_id = self._graph.get_node_id(geno) if not geno_id: continue @@ -441,7 +516,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int): # and each patient connects only to their own genos, so we wouldn't override the weight dict. # self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE self._genotype_candidates[patient_id][geno_id] = { - geno_num: (probability, 10) + geno_num: (probability, len(geno)) } # AMIT ADD # else: # print(f"Missing 'geno_num' for patient_id: {patient_id}") @@ -505,9 +580,10 @@ def score_matches( for hla_id, genotype_matches in self._genotype_candidates[ patient ].items(): # AMIT ADD + num_alleles = len(self.patients[patient]) for prob, matches in genotype_matches.values(): # AMIT CHANGE # match_info = (probability of patient's genotype, number of matches to patient's genotype) - if matches != 10 - mismatch: + if matches != num_alleles - mismatch: continue # add the probabilities multiplication of the patient and all the donors that has this genotype @@ -568,38 +644,27 @@ def __append_matching_donor( mm_number: int, ) -> None: """add a donor to the matches dictionary""" + pat = self.patients[patient].np() + don = self.get_most_common_genotype(donor) + compare_commons, gvh, hvg = locuses_match_between_genos(pat, don) - compare_commons = locuses_match_between_genos( - self.patients[patient], self.get_most_common_genotype(donor) - ) - - add_donors["Patient_ID"].append(patient) - add_donors["Donor_ID"].append(donor) + add_donors["Patient_ID"].append(-1 * patient) + add_donors["Donor_ID"].append(-1 * donor) allele_prob = self.probability_to_allele( don_id=donor, pat_geno=self.patients[patient] ) - add_donors["Match_Probability_A_1"].append(allele_prob[0]) - add_donors["Match_Probability_A_2"].append(allele_prob[1]) - add_donors["Match_Probability_B_1"].append(allele_prob[2]) - add_donors["Match_Probability_B_2"].append(allele_prob[3]) - add_donors["Match_Probability_C_1"].append(allele_prob[4]) - add_donors["Match_Probability_C_2"].append(allele_prob[5]) - add_donors["Match_Probability_DQB1_1"].append(allele_prob[6]) - add_donors["Match_Probability_DQB1_2"].append(allele_prob[7]) - add_donors["Match_Probability_DRB1_1"].append(allele_prob[8]) - 
add_donors["Match_Probability_DRB1_2"].append(allele_prob[9]) - - add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0]) - add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1]) - add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2]) - add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3]) - add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4]) + add_donors["Match_Probability"].append(allele_prob) + add_donors["Match_Between_Most_Commons"].append(compare_commons) add_donors["Matching_Probability"].append(match_prob) - add_donors["Number_Of_Mismatches"].append(mm_number) - add_donors["Permissive/Non-Permissive"].append( - "-" - ) # TODO: add permissiveness algorithm + actual_mismatches = sum(compare_commons) + + add_donors["Number_Of_Mismatches"].append(actual_mismatches) + add_donors["GvH_Mismatches"].append(gvh) + add_donors["HvG_Mismatches"].append(hvg) + + add_donors["Permissive/Non-Permissive"].append("-") + # TODO: add permissiveness algorithm # add the other given fields to the results for field in donors_info: diff --git a/grma/match/graph_wrapper.py b/grma/match/graph_wrapper.py index c855ae5..feb3080 100644 --- a/grma/match/graph_wrapper.py +++ b/grma/match/graph_wrapper.py @@ -5,9 +5,9 @@ from typing import Union import numpy as np - from grma.utilities.geno_representation import HashableArray from grma.match.lol_graph import LolGraph +from bidict import bidict NODES_TYPES = Union[int, HashableArray] @@ -58,11 +58,12 @@ def get_edge_data( ret = self._graph.get_edge_data(node1_num, node2_num) return default if ret == exception_val else ret - def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False): - node_num = self._map_node_to_number[node] if not search_lol_id else node + def class_neighbors( + self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10 + ): + node_num = self._map_node_to_number[node[0]] if not search_lol_id else node neighbors_list = self._graph.neighbors_unweighted(node_num) - - neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16) + neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint32) for i, neighbor in enumerate(neighbors_list): neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor) @@ -112,7 +113,7 @@ def neighbors( def neighbors_2nd(self, node): node_num = self._map_node_to_number[node] r0, r1 = self._graph.neighbors_2nd(node_num) - return r0[:-1], r1 + return r0, r1 def node_value_from_id(self, node_id: int) -> NODES_TYPES: """convert lol ID to node value""" @@ -122,5 +123,5 @@ def node_value_from_id(self, node_id: int) -> NODES_TYPES: @classmethod def from_pickle(cls, path: Union[str, PathLike]): - graph_dict = pickle.load(open(path, "rb")) - return cls(graph_dict) + graph_bdict = pickle.load(open(path, "rb")) + return cls(graph_bdict[0]), bidict(graph_bdict[1]) diff --git a/grma/match/lol_graph.pyx b/grma/match/lol_graph.pyx index 44c111f..c123d15 100644 --- a/grma/match/lol_graph.pyx +++ b/grma/match/lol_graph.pyx @@ -15,8 +15,8 @@ cdef class LolGraph: UINT[:] _index_list UINT[:] _neighbors_list FLOAT[:] _weights_list - UINT[:] _map_number_to_num_node - UINT16[:, :] _map_number_to_arr_node + INT[:] _map_number_to_num_node + UINT[:, :] _map_number_to_arr_node # Changed from UINT16[:, :] UINT _arrays_start bint directed bint weighted @@ -24,8 +24,8 @@ cdef class LolGraph: def __init__(self, UINT[:] index_list, UINT[:] neighbors_list, FLOAT[:] weights_list, - UINT[:] 
map_number_to_num_node, - UINT16[:, :] map_number_to_arr_node, + INT[:] map_number_to_num_node, + UINT[:, :] map_number_to_arr_node, # Changed from UINT16[:, :] UINT arrays_start, bint directed, bint weighted): self._index_list = index_list @@ -49,12 +49,12 @@ cdef class LolGraph: @cython.boundscheck(False) @cython.wraparound(False) - cpdef UINT16[:] arr_node_value_from_id(self, UINT node_id): + cpdef UINT[:] arr_node_value_from_id(self, UINT node_id): # Changed from UINT16[:] return self._map_number_to_arr_node[node_id - self._arrays_start] @cython.boundscheck(False) @cython.wraparound(False) - cpdef UINT num_node_value_from_id(self, UINT node_id): + cpdef INT num_node_value_from_id(self, UINT node_id): return self._map_number_to_num_node[node_id] @cython.boundscheck(False) @@ -108,7 +108,6 @@ cdef class LolGraph: return self._weights_list[idx + node2_index] return -1 - # get neighbors of specific node n @cython.boundscheck(False) @cython.wraparound(False) cpdef tuple neighbors_weighted(self, UINT node): @@ -152,9 +151,10 @@ cdef class LolGraph: """return the second degree neighbors of a node - neighbors of neighbors.""" cdef UINT idx, idx_end, i, j, pointer, neighbor_1st, idx_1st_neigh, idx_end_1st_neigh, neighbor_id cdef np.ndarray[UINT, ndim=1] neighbors_list_id, neighbors_id - cdef UINT16[:] arr - cdef np.ndarray[UINT16, ndim=2] neighbors_value + cdef UINT[:] arr # Changed from UINT16[:] + cdef np.ndarray[UINT, ndim=2] neighbors_value # Changed from UINT16 cdef UINT num_of_neighbors_2nd + cdef UINT loci_len idx = self._index_list[node] idx_end = self._index_list[node + 1] @@ -162,7 +162,6 @@ cdef class LolGraph: neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32) for i in range(idx, idx_end): neighbors_list_id[i - idx] = self._neighbors_list[i] - num_of_neighbors_2nd = self._weights_list[idx] neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32) @@ -177,11 +176,12 @@ cdef class LolGraph: neighbors_id[pointer] = self._neighbors_list[j] pointer += 1 - neighbors_value = np.zeros((num_of_neighbors_2nd - 1, 10), dtype=np.uint16) - for i in range(len(neighbors_id) - 1): + loci_len = self._map_number_to_arr_node.shape[1] + neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint32) # Changed from uint16 + for i in range(len(neighbors_id)): neighbor_id = neighbors_id[i] arr = self.arr_node_value_from_id(neighbor_id) - for j in range(10): + for j in range(loci_len): neighbors_value[i, j] = arr[j] return neighbors_id, neighbors_value diff --git a/grma/match/match.py b/grma/match/match.py index 302b71f..76f71e0 100644 --- a/grma/match/match.py +++ b/grma/match/match.py @@ -8,6 +8,8 @@ import pandas as pd from grim import grim import csv +import ast +from bidict import bidict from grma.match import Graph as MatchingGraph from grma.match.donors_matching import DonorsMatching, _init_results_df @@ -118,6 +120,7 @@ def find_matches( save_to_csv: bool = False, calculate_time: bool = False, output_dir: str = "output", + bdict: bidict = None, ): """ The main function responsible for performing the matching. @@ -146,7 +149,7 @@ def find_matches( if verbose: print_time("Start graph matching") - g_m = DonorsMatching(match_graph, verbose=verbose) + g_m = DonorsMatching(match_graph, verbose=verbose, bdict=bdict) # create patients graph and find all candidates start_build_graph = time.time() @@ -195,7 +198,22 @@ def find_matches( # the returned dictionary. 
{patient ID: pd.DataFrame(matches + features)} patients_results = {patient: None for patient in patients} - + with open(imputation_filename, "r") as f: + header = f.readline().strip() + # loci = list(dict.fromkeys([('DRB3/4/5' if item in ['DRB3', 'DRB4', 'DRB5'] else item) for item in line.split(',')[1].replace('*', ',').replace('+', ',').replace('^', ',').replace(':',',').split(',') if not item.isdigit() and item])) + geno_header = header.split(",")[1] # column with loci/genotype schema + + # Extract locus names: take substring before '*' for each '^'-separated locus + raw_loci = [seg.split("*", 1)[0].strip() for seg in geno_header.split("^")] + + # Keep unique order, drop empties/NNNN, and fold DRB3/4/5 + loci = list( + dict.fromkeys( + ("DRB3/4/5" if x in {"DRB3", "DRB4", "DRB5", "DRBX"} else x) + for x in raw_loci + if x and x != "N" + ) + ) if patients: avg_build_time = (end_build_graph - start_build_graph) / len(patients) else: @@ -217,6 +235,28 @@ def find_matches( patient, g_m, donors_info, threshold, cutoff, classes, subclasses ) + match_Probability = results_df["Match_Probability"] + match_Between_Most_Commons = results_df["Match_Between_Most_Commons"] + Permissive = results_df["Permissive/Non-Permissive"] + df_new = results_df.drop( + columns=[ + "Match_Probability", + "Match_Between_Most_Commons", + "Permissive/Non-Permissive", + ] + ).copy() + + for idx, row in enumerate(match_Probability): + k = 0 + for locus in loci: + for i in [1, 2]: + df_new.loc[idx, f"Match_Probability_{locus}_{i}"] = row[k] + k += 1 + df_new["Permissive/Non-Permissive"] = Permissive + for idx, row in enumerate(match_Between_Most_Commons): + for l, locus in enumerate(loci): + df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l] + results_df = df_new.copy() end = time.time() patient_time = end - start + avg_build_time @@ -224,7 +264,7 @@ def find_matches( patients_results[patient] = (results_df, patient_time) else: patients_results[patient] = results_df - + patient = -1 * patient if save_to_csv: # save results to csv results_df.to_csv( @@ -239,7 +279,6 @@ def find_matches( f"Saved Matching results for {patient} in " f"{os.path.join(f'{output_dir}/search_{search_id}', f'Patient_{patient}.csv')}" ) - return patients_results diff --git a/grma/utilities/cutils.pyx b/grma/utilities/cutils.pyx index b08b624..895bf06 100644 --- a/grma/utilities/cutils.pyx +++ b/grma/utilities/cutils.pyx @@ -9,8 +9,6 @@ ctypedef np.int8_t INT8 ctypedef np.uint16_t UINT16 ctypedef np.uint32_t UINT32 - - @cython.boundscheck(False) @cython.wraparound(False) cpdef np.ndarray[UINT32, ndim=2] cdrop_less_than_7_matches(np.ndarray[UINT32, ndim=1] ids, @@ -28,11 +26,10 @@ cpdef np.ndarray[UINT32, ndim=2] cdrop_less_than_7_matches(np.ndarray[UINT32, nd count += 1 return ret[:count, :] - @cython.boundscheck(False) @cython.wraparound(False) -cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] patients_geno, - np.ndarray[UINT16, ndim=2] donors_genos, +cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT32, ndim=1] patients_geno, # Changed from UINT16 + np.ndarray[UINT32, ndim=2] donors_genos, # Changed from UINT16 np.ndarray[UINT8, ndim=1] allele_range, UINT8 init_count_similar): """ Takes 2 genotypes to check similarity between \n @@ -45,9 +42,9 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati """ cdef: np.ndarray[INT8, ndim=1] similarities - np.ndarray[UINT16, ndim=1] donors_geno + np.ndarray[UINT32, ndim=1] donors_geno # Changed from UINT16 UINT8 
count_similar, counted, number_of_alleles, allele_num - UINT16 patient_alleles0, patient_alleles1, donor_alleles0, donor_alleles1 + UINT32 patient_alleles0, patient_alleles1, donor_alleles0, donor_alleles1 # Changed from UINT16 similarities = np.zeros(len(donors_genos), dtype=np.int8) number_of_alleles = len(allele_range) @@ -83,17 +80,16 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati if counted - count_similar > 3: similarities[i] = -1 break - if 10 - count_similar > 3: + if len(donors_geno) - count_similar > 3: similarities[i] = -1 else: similarities[i] = count_similar return similarities - @cython.boundscheck(False) @cython.wraparound(False) -cpdef UINT32 chash(np.ndarray[UINT16, ndim=1] arr): +cpdef UINT32 chash(np.ndarray[UINT32, ndim=1] arr): cdef UINT32 h = 17 cdef UINT8 i for i in range(len(arr)): diff --git a/grma/utilities/geno_representation.py b/grma/utilities/geno_representation.py index 1773874..f1ec8d7 100644 --- a/grma/utilities/geno_representation.py +++ b/grma/utilities/geno_representation.py @@ -27,7 +27,7 @@ class HashableArray: def __init__(self, arr): self.it = None - self.arr = np.array(arr, dtype=np.uint16) + self.arr = np.array(arr, dtype=np.uint32) def __hash__(self): return chash(self.arr) diff --git a/grma/utilities/utils.py b/grma/utilities/utils.py index 3f52a1b..5de97d2 100644 --- a/grma/utilities/utils.py +++ b/grma/utilities/utils.py @@ -3,9 +3,16 @@ from datetime import datetime from typing import Iterable +import numpy as np + from grma.utilities.cutils import cdrop_less_than_7_matches, ccheck_similarity from collections.abc import Sequence +import re +import ast +from typing import List +import hashlib +from grma.utilities.geno_representation import HashableArray LENGTH_OF_NUMBERS_IN_ALLELE: int = 4 TO_SEROLOGY: dict[str, int] = { @@ -225,9 +232,8 @@ def allele_style( def tuple_geno_to_int(tuple_geno: Iterable[int]): """convert genotype from Iterable to integer""" string = "" - fill: int = 4 for i in tuple_geno: - string += str(i).zfill(fill) + string += str(i) return int(string) @@ -243,7 +249,10 @@ def drop_less_than_7_matches(ids, similarities): def check_similarity(patients_geno, donors_genos, allele_range, init_count_similar): return ccheck_similarity( - patients_geno, donors_genos, allele_range, init_count_similar + np.fromiter(patients_geno, dtype=np.uint32), + donors_genos, + allele_range, + init_count_similar, ) @@ -255,3 +264,25 @@ def gl_string_to_integers(genotype: str) -> Sequence[int]: ] return genotype + + +def gl_string_to_hash(s: str) -> int: + return int.from_bytes(hashlib.blake2b(s.encode(), digest_size=8).digest(), "big") + + +def geno_to_int(geno_str): + geno = tuple([a for block in geno_str.split("^") for a in block.split("+")]) + allele_list = [] + for item in geno: + locus, allele = item.split("*") + a1, a2 = allele.split(":") + if locus == "DRB3": + a1 = "3" + a1 + elif locus == "DRB4": + a1 = "4" + a1 + elif locus == "DRB5": + a1 = "5" + a1 + else: + a1 = "0" + a1 + allele_list.append(int("1" + a1 + a2)) + return HashableArray(allele_list)
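
## Reviewer notes: sketches of the new encodings

The notes below are reviewer sketches, not part of the diff; example values, paths, and helper names are invented unless they appear in the changed files.

**Hash-keyed class/subclass nodes.** `build_donors_graph.py` and `donors_matching.py` now key class and subclass nodes by a blake2b hash folded into disjoint integer ranges, and donor/patient IDs are negated on load, so the three namespaces cannot collide (folding into 10^9 buckets does leave a small theoretical collision risk between distinct classes). A minimal sketch, reusing `gl_string_to_hash` exactly as defined in `grma/utilities/utils.py`:

```python
import hashlib

def gl_string_to_hash(s: str) -> int:
    # copied from grma/utilities/utils.py
    return int.from_bytes(hashlib.blake2b(s.encode(), digest_size=8).digest(), "big")

def class_node_id(class_alleles: tuple) -> int:
    # class nodes land in [1_000_000_000, 2_000_000_000)
    return gl_string_to_hash(str(class_alleles)) % 1_000_000_000 + 1_000_000_000

def subclass_node_id(subclass_alleles: tuple) -> int:
    # subclass nodes land in [2_000_000_000, 3_000_000_000)
    return gl_string_to_hash(str(subclass_alleles)) % 1_000_000_000 + 2_000_000_000

donor_id = -1 * 123456  # donor/patient IDs are negated, so they stay < 0

assert 1_000_000_000 <= class_node_id((3, 4, 5, 6, 7, 8)) < 2_000_000_000
assert 2_000_000_000 <= subclass_node_id((3, 0, 5, 6, 7, 8)) < 3_000_000_000
assert donor_id < 0
```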
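**Variable class-I/class-II split.** The fixed `CLASS_I_END = 6` is gone; both `build_donors_graph.py` and `donors_matching.py` compute `-2 * int(-len(geno) / 4 - 0.5)` per genotype. For the even-length genotypes used here that expression is just `2 * ceil(len/4)`, i.e. class I gets the rounded-up first half of the loci. A worked check (`class_i_end` is a hypothetical name; the diff computes this inline):

```python
import math

def class_i_end(n: int) -> int:
    # int() truncates toward zero, so this rounds n/4 up, times 2
    return -2 * int(-n / 4 - 0.5)

for n in (8, 10, 12, 14):  # 4-7 loci, two alleles each
    assert class_i_end(n) == 2 * math.ceil(n / 4)

# a classic 10-allele genotype still splits 6 / 4, matching the old constant
geno = list(range(1, 11))
split = class_i_end(len(geno))
assert (len(geno[:split]), len(geno[split:])) == (6, 4)
```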
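**Shared allele dictionary.** Instead of `gl_string_to_integers`, donor alleles are interned through a `bidict` whose codes start at 3: 0 marks a missing/null (`'N'`) allele, and 1 and 2 are reserved for patient alleles that never occur in the donor pool (the patient encoder emits `[1, 1]` for a homozygous unknown locus and `[1, 2]` for two distinct unknowns). A sketch of the donor-side interning, assuming the `bidict` package; the allele strings are invented:

```python
from bidict import bidict

codes = bidict()

def encode_donor_allele(allele: str) -> int:
    if "N" in allele:                    # null allele -> missing
        return 0
    if allele not in codes:
        codes[allele] = len(codes) + 3   # 1 and 2 stay reserved for unknowns
    return codes[allele]

assert encode_donor_allele("A*01:01") == 3
assert encode_donor_allele("B*08:01") == 4
assert encode_donor_allele("A*01:01") == 3      # stable on re-encounter
assert encode_donor_allele("DRB1*15:01N") == 0  # null allele
assert codes.inverse[4] == "B*08:01"            # reverse lookup for decoding
```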
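**Directional mismatch counting.** The rewritten `locuses_match_between_genos` counts, per locus, alleles the patient carries that the donor lacks (GvH) and the reverse (HvG), skipping missing alleles; note the returned list now holds per-locus *mismatches* (`max(gvh_i, hvg_i)`) rather than matches, which is why `__append_matching_donor` can take `sum(compare_commons)` as the mismatch total. A condensed restatement with a usage example:

```python
def locuses_match_between_genos(geno_pat, geno_don):
    matches, total_gvh, total_hvg = [], 0, 0
    for i in range(0, len(geno_pat), 2):
        P = {x for x in geno_pat[i:i + 2] if x not in (None, 0)}
        D = {x for x in geno_don[i:i + 2] if x not in (None, 0)}
        gvh_i = len(P - D)              # patient has, donor lacks
        hvg_i = len(D - P)              # donor has, patient lacks
        total_gvh += gvh_i
        total_hvg += hvg_i
        matches.append(max(gvh_i, hvg_i))
    return matches, total_gvh, total_hvg

# one locus: alleles 7/8 vs 7/missing -> one GvH mismatch, no HvG
assert locuses_match_between_genos([7, 8], [7, 0]) == ([1], 1, 0)
```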
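**Locus names from the input header.** Because results are no longer hard-coded to A/B/C/DQB1/DRB1, `match.py` derives the locus list from the imputation file's genotype column and folds DRB3/DRB4/DRB5 into one label, then re-expands the per-allele probability lists into `Match_Probability_{locus}_{i}` columns. The extraction logic, run standalone on an invented header value:

```python
geno_header = "A*01:01+A*02:01^B*08:01+B*07:02^DRB3*01:01+DRB4*01:03"

# take the substring before '*' for each '^'-separated locus segment
raw_loci = [seg.split("*", 1)[0].strip() for seg in geno_header.split("^")]

# keep unique order, drop empties, and fold DRB3/4/5 into one label
loci = list(
    dict.fromkeys(
        ("DRB3/4/5" if x in {"DRB3", "DRB4", "DRB5", "DRBX"} else x)
        for x in raw_loci
        if x and x != "N"
    )
)
assert loci == ["A", "B", "DRB3/4/5"]
```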
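**New pickle layout.** `BuildMatchingGraph.to_pickle` now dumps `[lol_graph, bidirectional_dict]`, and `Graph.from_pickle` returns the pair, so existing callers must unpack two values and thread the bidict into matching. A usage sketch; the file names are invented and the positional order of `find_matches`' leading arguments is assumed from its body (`imputation_filename`, then the graph):

```python
from grma import find_matches
from grma.match import Graph

# from_pickle unpacks the [graph, bidict] list into a (Graph, bidict) pair
match_graph, allele_codes = Graph.from_pickle("donors_graph.pkl")

results = find_matches(
    "patients_imputation.csv",  # imputation_filename
    match_graph,
    bdict=allele_codes,         # new keyword added by this diff
)
```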