1+ import difflib
12import itertools
23import logging
34import os
2728from Bio .Align import MultipleSeqAlignment
2829from Bio .Seq import Seq
2930from Bio .SeqRecord import SeqRecord
31+ from biopandas .pdb import PandasPdb
3032from pytorch_lightning .loggers import TensorBoardLogger , WandbLogger
3133
3234from project .utils .deepinteract_constants import FEAT_COLS , ALLOWABLE_FEATS , D3TO1
@@ -593,9 +595,11 @@ def copy_input_to_raw_dir(input_dataset_dir: str, pdb_filepath: str, pdb_code: s
593595 """Make a copy of the input PDB file in the newly-created raw directory."""
594596 filename = db .get_pdb_code (pdb_filepath ) + f'_{ chain_indic } .pdb' \
595597 if chain_indic not in pdb_filepath else db .get_pdb_name (pdb_filepath )
598+ new_filepath = os .path .join (input_dataset_dir , "raw" , pdb_code , filename )
596599 input_copy_cmd = f'cp { pdb_filepath } { os .path .join (input_dataset_dir , "raw" , pdb_code , filename )} '
597600 input_copy_proc = subprocess .Popen (input_copy_cmd .split (), stdout = subprocess .PIPE , cwd = os .getcwd ())
598601 _ , _ = input_copy_proc .communicate () # Wait until the input copy cmd is finished
602+ return new_filepath
599603
600604
601605def make_dataset (input_dataset_dir = 'datasets/Input/raw' , output_dir = 'datasets/Input/interim' , num_cpus = 1 ,
@@ -618,6 +622,41 @@ def make_dataset(input_dataset_dir='datasets/Input/raw', output_dir='datasets/In
618622 pair .all_complex_to_pairs (complexes , source_type , get_pairs , pairs_dir , num_cpus )
619623
620624
def recover_any_missing_chain_ids(interim_dataset_dir: str, new_pdb_filepath: str,
                                  orig_pdb_filepath: str, pdb_code: str, chain_number: int):
    """Restore any missing chain IDs for the chain represented by the corresponding Pandas DataFrame.

    Args:
        interim_dataset_dir: Interim dataset directory containing the 'pairs' subdirectory to update.
        new_pdb_filepath: Path to the copied (renamed) PDB file whose chain IDs may be missing.
        orig_pdb_filepath: Path to the original input PDB file (its filename may encode the chain ID).
        pdb_code: PDB code naming the pair subdirectory under 'pairs'.
        chain_number: 1 to update the Pair's first chain DataFrame (df0), 2 for the second (df1).

    Raises:
        FileNotFoundError: If no existing Pair file matching the new PDB code is found.
    """
    orig_pdb_chain_id = '_'  # Default value for missing chain IDs
    new_pdb_code = db.get_pdb_code(new_pdb_filepath)
    orig_pdb_name = db.get_pdb_name(orig_pdb_filepath)
    new_pdb_df = PandasPdb().read_pdb(new_pdb_filepath).df['ATOM']
    unique_chain_ids = np.unique(new_pdb_df['chain_id'].values)
    # Treat whitespace-only IDs (e.g., ' ') the same as empty ones — PDB files often pad with spaces
    non_blank_chain_ids = [chain_id for chain_id in unique_chain_ids if chain_id.strip()]
    # Ascertain the chain ID corresponding to the original PDB file, using one of two available methods.
    # Method 1: Used with datasets such as EVCoupling adopting .atom filename extensions (e.g., 4DI3C.atom)
    # Method 2: Used with datasets such as DeepHomo adopting regular .pdb filename extensions (e.g., 2FNUA.pdb)
    if not non_blank_chain_ids:  # Method 1: Try to use filename differences
        # No chain IDs were found, so we instead need to look to the original PDB filename to get the orig. chain ID
        pdb_code_diffs = difflib.ndiff(new_pdb_code, orig_pdb_name)
        for diff_entry in pdb_code_diffs:
            if diff_entry[0] == '+':
                added_chars = diff_entry[1:].strip()
                # Guard against a diff entry that adds only whitespace (would otherwise raise IndexError)
                if added_chars:
                    orig_pdb_chain_id = added_chars[0]
                    break
    else:  # Method 2: Use the first non-blank unique chain ID (e.g., 'A')
        orig_pdb_chain_id = non_blank_chain_ids[0]
    # Locate the existing Pair file(s) for this chain
    pair_dir = os.path.join(interim_dataset_dir, 'pairs', pdb_code)
    pair_filenames = [os.path.join(pair_dir, filename)
                      for filename in os.listdir(pair_dir) if new_pdb_code in filename]
    if not pair_filenames:
        raise FileNotFoundError(f'No existing Pair found for {new_pdb_code} in {pair_dir}')
    # Load in the existing Pair
    with open(pair_filenames[0], 'rb') as f:
        pair = dill.load(f)
    # Update only the requested chain's DataFrame with the newly-recovered chain ID
    if chain_number == 1:
        pair.df0.chain = orig_pdb_chain_id
    elif chain_number == 2:
        pair.df1.chain = orig_pdb_chain_id
    # Save the updated Pair
    with open(pair_filenames[0], 'wb') as f:
        dill.dump(pair, f)
658+
659+
621660def generate_psaia_features (psaia_dir = '~/Programs/PSAIA_1.0_source/bin/linux/psa' ,
622661 psaia_config = 'datasets/builder/psaia_config_file_input.txt' ,
623662 pdb_dataset = 'datasets/Input/raw' , pkl_dataset = 'datasets/Input/interim/parsed' ,
@@ -727,9 +766,13 @@ def convert_input_pdb_files_to_pair(left_pdb_filepath: str, right_pdb_filepath:
727766 pdb_code = db .get_pdb_group (list (ca .get_complex_pdb_codes ([left_pdb_filepath , right_pdb_filepath ]))[0 ])
728767 # Iteratively execute the PDB file feature generation process
729768 create_input_dir_struct (input_dataset_dir , pdb_code )
730- copy_input_to_raw_dir (input_dataset_dir , left_pdb_filepath , pdb_code , 'l_u' )
731- copy_input_to_raw_dir (input_dataset_dir , right_pdb_filepath , pdb_code , 'r_u' )
769+ new_l_u_filepath = copy_input_to_raw_dir (input_dataset_dir , left_pdb_filepath , pdb_code , 'l_u' )
770+ new_r_u_filepath = copy_input_to_raw_dir (input_dataset_dir , right_pdb_filepath , pdb_code , 'r_u' )
732771 make_dataset (os .path .join (input_dataset_dir , 'raw' ), os .path .join (input_dataset_dir , 'interim' ))
772+ recover_any_missing_chain_ids (os .path .join (input_dataset_dir , 'interim' ),
773+ new_l_u_filepath , left_pdb_filepath , pdb_code , 1 )
774+ recover_any_missing_chain_ids (os .path .join (input_dataset_dir , 'interim' ),
775+ new_r_u_filepath , right_pdb_filepath , pdb_code , 2 )
733776 generate_psaia_features (psaia_dir = psaia_dir ,
734777 psaia_config = psaia_config ,
735778 pdb_dataset = os .path .join (input_dataset_dir , 'raw' ),
0 commit comments