diff --git a/data/emin.mdp b/data/emin.mdp deleted file mode 100644 index af67f5fa..00000000 --- a/data/emin.mdp +++ /dev/null @@ -1,23 +0,0 @@ -title = CHARMM steepest descent enrgy minimisation - -; Parameters describing what to do, when to stop and what to save -integrator = steep ; Algorithm (steep = steepest descent minimization) -emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm -emstep = 0.01 ; Minimization step size -nstenergy = 500 ; save energies every 1.0 ps, so we can observe if we are successful -nsteps = 100 ; run as long as we need -ld-seed = -1; -continuation = no ; - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - diff --git a/data/emin_charmm.mdp b/data/emin_charmm.mdp deleted file mode 100644 index af67f5fa..00000000 --- a/data/emin_charmm.mdp +++ /dev/null @@ -1,23 +0,0 @@ -title = CHARMM steepest descent enrgy minimisation - -; Parameters describing what to do, when to stop and what to save -integrator = steep ; Algorithm (steep = steepest descent minimization) -emtol = 1000.0 ; Stop minimization when the maximum force < 1000.0 kJ/mol/nm -emstep = 0.01 ; Minimization step size -nstenergy = 500 ; save energies every 1.0 ps, so we can observe if we are successful -nsteps = 100 ; run as long as we need -ld-seed = -1; -continuation = no ; - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - diff --git a/data/md.mdp b/data/md.mdp deleted file mode 100644 index cd2b6cf0..00000000 --- a/data/md.mdp +++ /dev/null @@ -1,50 +0,0 @@ -title = CHARMM NPT - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 500000 ; 1000 ps = 1 ns - - -; periodic boundary condition -pbc = xyz ; - -continuation = yes - -; Output control - output frequency in steps -; Output frequency for output trajctory file ,trr -nstxout = 0 ; for writing coords (x) -nstvout = 0 ; for writing velocities (v) -nstfout = 0 ; for writing forces (f) -; Output frequency for energies to log file and energy file -nstlog = 50 ; for writing energies to log file -nstenergy = 500 ; for writing energies to edr file -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) -ld-seed = -1; - -; Pressure coupling is on -pcoupl = C-rescale ; Pressure coupling on in NPT -pcoupltype = isotropic ; uniform scaling of box vectors -tau_p = 5.0 ; time constant, in ps -ref_p = 1.0 ; reference pressure, in bar -compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 -refcoord_scaling = com - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - diff --git a/data/md_charmm.mdp b/data/md_charmm.mdp deleted file mode 100644 index 1e41dfec..00000000 --- a/data/md_charmm.mdp +++ /dev/null @@ -1,49 +0,0 @@ -title = CHARMM NPT - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 500000 ; 1000 ps = 1 ns - -; periodic boundary condition -pbc = xyz ; - -continuation = yes - -; Output control - output frequency in steps -; Output frequency for output trajctory file ,trr -nstxout = 0 ; for writing coords (x) -nstvout = 0 ; for writing velocities (v) -nstfout = 0 ; for writing forces (f) -; Output frequency for energies to log file and energy file -nstlog = 50 ; for writing energies to log file -nstenergy = 500 ; for writing energies to edr file -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) -ld-seed = -1; - -; Pressure coupling is on -pcoupl = C-rescale ; Pressure coupling on in NPT -pcoupltype = isotropic ; uniform scaling of box vectors -tau_p = 5.0 ; time constant, in ps -ref_p = 1.0 ; reference pressure, in bar -compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 -refcoord_scaling = com - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - diff --git a/data/npt.mdp b/data/npt.mdp deleted file mode 100644 index 1f02ecd3..00000000 --- a/data/npt.mdp +++ /dev/null @@ -1,50 +0,0 @@ -title = CHARMM NPT equilibration -define = -DPOSRES ; position restrain the protein - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 50000 ; 2 * 50000 = 100 ps -nstenergy = 500 ; save energy and temperature every 1.0 ps - -nstlog = 50 - -; periodic boundary condition -pbc = xyz ; - -continuation = yes -ld-seed = -1 ; - -; Pressure coupling is on -pcoupl = C-rescale ; Pressure coupling on in NPT -pcoupltype = isotropic ; uniform scaling of box vectors -tau_p = 5.0 ; time constant, in ps -ref_p = 1.0 ; reference pressure, in bar -compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 -refcoord_scaling = com - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; coupling groups -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - - - - - diff --git a/data/npt_charmm.mdp b/data/npt_charmm.mdp deleted file mode 100644 index d2fffcec..00000000 --- a/data/npt_charmm.mdp +++ /dev/null @@ -1,50 +0,0 @@ -title = CHARMM NPT equilibration -define = -DPOSRES ; position restrain the protein - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 50000 ; 2 * 50000 = 100 ps -nstenergy = 500 ; save energy and temperature every 1.0 ps - -nstlog = 50 - -; periodic boundary condition -pbc = xyz ; - -continuation = yes -ld-seed = -1 - -; Pressure coupling is on -pcoupl = C-rescale ; Pressure coupling on in NPT -pcoupltype = isotropic ; uniform scaling of box vectors -tau_p = 5.0 ; time constant, in ps -ref_p = 1.0 ; reference pressure, in bar -compressibility = 4.5e-5 ; isothermal compressibility of water, bar^-1 -refcoord_scaling = com - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; coupling groups -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - - - - - diff --git a/data/nvt.mdp b/data/nvt.mdp deleted file mode 100644 index 0dc1e33e..00000000 --- a/data/nvt.mdp +++ /dev/null @@ -1,45 +0,0 @@ -title = CHARMM NVT equilibration -define = -DPOSRES ; position restrain the protein - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 50000 ; 2 * 50000 = 100 ps -nstenergy = 500 ; save energy and temperature every 1.0 ps - -nstlog = 50 - -; periodic boundary condition -pbc = xyz ; -continuation = no -ld-seed = -1 - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; coupling groups -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Pressure coupling is off -pcoupl = no - -; Velocity generation -gen_vel = yes ; assign velocities from Maxwell distribution -gen_temp = 300 ; temperature for Maxwell distribution -gen_seed = -1 ; - -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - diff --git a/data/nvt_charmm.mdp b/data/nvt_charmm.mdp deleted file mode 100644 index 2a368d42..00000000 --- a/data/nvt_charmm.mdp +++ /dev/null @@ -1,44 +0,0 @@ -title = CHARMM NVT equilibration -define = -DPOSRES ; position restrain the protein - -; Parameters describing what to do, when to stop and what to save -integrator = md ; leap-frog integrator -dt = 0.002 ; 2 fs -nsteps = 50000 ; 2 * 50000 = 100 ps -nstenergy = 500 ; save energy and temperature every 1.0 ps - -nstlog = 50 - -; periodic boundary condition -pbc = xyz ; -ld-seed = -1; - -; Keep system temperature fluctuating physically correct -tcoupl = V-rescale ; modified Berendsen thermostat -tc-grps = system ; coupling groups -tau_t = 1.0 ; time constant, in ps -ref_t = 300 ; reference temperature, one for each group, in K - -; Pressure coupling is off -pcoupl = no ; -continuation = no ; - -; Velocity generation -gen_vel = yes ; assign velocities from Maxwell distribution -gen_temp = 300 ; temperature for Maxwell distribution -gen_seed = -1 -; Output frequency and precision for .xtc file -nstxout-compressed = 2500 ; for writing coords (x) - -; Settings that make sure we run with parameters in harmony with the selected force-field -constraints = h-bonds ; bonds involving H are constrained -rcoulomb = 1.2 ; short-range electrostatic cutoff (in nm) -rvdw = 1.2 ; short-range van der Waals cutoff (in nm) -vdw-modifier = Force-switch ; specific CHARMM -rvdw_switch = 1.0 ; -DispCorr = no ; account for cut-off vdW scheme - -;in case of CHARMM DispCorr = EnerPres only for monolayers -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -fourierspacing = 0.15 ; grid spacing for FFT - - diff --git a/data/rerun.mdp b/data/rerun.mdp deleted file mode 100644 index 80891a8e..00000000 --- a/data/rerun.mdp +++ /dev/null @@ -1,44 +0,0 @@ -; Run parameters -integrator = md ; Algorithm (Leap-frog integrator for rerun) -nsteps = 0 ; 0 steps for rerun - -; Output control -nstxout = 100 ; suppress output of coordinates -nstvout = 0 ; suppress output of velocities -nstfout = 0 ; suppress output of forces -nstlog = 500 ; frequency to write logs -nstenergy = 100 ; frequency to write energies - -; Bond parameters -continuation = yes ; Initial simulation done, continuing -constraint_algorithm = lincs ; holonomic constraints -constraints = all-bonds ; all bonds (even heavy atom-H bonds) constrained -lincs_iter = 1 ; accuracy of LINCS -lincs_order = 4 ; also related to accuracy - -; Neighbor searching -ns_type = grid ; search neighboring grid cels -nstlist = 20 ; 20 fs, frequency to update neighbor list -rlist = 1.0 ; cut-off for making neighbor list (nm) - -; Electrostatics and VdW -coulombtype = PME ; Particle Mesh Ewald for long-range electrostatics -rcoulomb = 1.0 ; nm for electrostatic cut-off -rvdw = 1.0 ; nm for van der Waals cut-off -pme_order = 4 ; cubic interpolation -fourierspacing = 0.16 ; grid spacing for FFT - -; Temperature coupling is off -tcoupl = no ; No temperature coupling - -; Pressure coupling is off -pcoupl = no ; No pressure coupling - -; Periodic boundary conditions -pbc = xyz ; 3-D PBC - -; Dispersion correction -DispCorr = EnerPres ; Energy and pressure corrections - -; Velocity generation -gen_vel = no ; Velocity generation is off diff --git a/environment.yml b/environment.yml index d686a0f2..947372e5 100644 --- a/environment.yml +++ b/environment.yml @@ -18,4 +18,5 @@ dependencies: - numpy==1.26.4 - parmed==4.2.2 - plotly==5.22.0 - - kaleido==0.2.1 \ No newline at end of file + - kaleido==0.2.1 + - Bio==1.6.2 \ No newline at end of file diff --git a/folding/experiments/unfolding_analysis.py b/folding/experiments/unfolding_analysis.py new file mode 100644 index 00000000..eb6e9c68 --- /dev/null +++ b/folding/experiments/unfolding_analysis.py @@ -0,0 +1,322 @@ +### This script is used to run a sensitivity search for a protein folding simulation. +### Used for benchmarking. + +import os +import time +import wandb +import random +from typing import Dict, List +from tqdm import tqdm +from pathlib import Path +import bittensor as bt + +import pandas as pd +from box import Box # install using pip install box +import copy + +import plotly.express as px +import openmm as mm +import openmm.app as app +import openmm.unit as unit + +from folding.base.simulation import OpenMMSimulation +from folding.validators.hyperparameters import HyperParameters +from folding.utils.opemm_simulation_config import SimulationConfig +from folding.validators.protein import Protein +from folding.utils.reporters import ExitFileReporter, LastTwoCheckpointsReporter +from folding.utils.config import config +import numpy as np + +ROOT_DIR = Path(__file__).resolve().parents[2] + +ROOT_PATH = Path(__file__).parent +SEED = 42 +SIMULATION_STEPS = {"nvt": 1_000_000} + + +def log_event(event=None, protein_vis=None): + """Log the event to the console and to the wandb logger.""" + # bt.logging.info(f"Event: {event}") + if event: + wandb.log(event) + if protein_vis: + wandb.log({"protein_vis": wandb.Molecule(protein_vis)}) + + +def create_wandb_run( + pdb_id: str, + project: str = "folding-openmm", + entity: str = "macrocosmos", + tags: List = [], +): + wandb.init(project=project, entity=entity, tags=tags, name=pdb_id) + + +def configure_commands( + state: str, + seed: int, + system_config: SimulationConfig, + pdb_obj: app.PDBFile, + output_dir: str, + pdb_id: str, + angle: int, + CHECKPOINT_INTERVAL: int = 10000, + STATE_DATA_REPORTER_INTERVAL: int = 10, + EXIT_REPORTER_INTERVAL: int = 10, +) -> Dict[str, List[str]]: + simulation, _ = OpenMMSimulation().create_simulation( + pdb=pdb_obj, + system_config=system_config.get_config(), + seed=seed, + state=state, + ) + simulation.reporters.append( + LastTwoCheckpointsReporter( + file_prefix=f"{output_dir}/{state}", + reportInterval=CHECKPOINT_INTERVAL, + ) + ) + simulation.reporters.append( + app.StateDataReporter( + file=f"{output_dir}/{state}_{seed}.log", + reportInterval=STATE_DATA_REPORTER_INTERVAL, + step=True, + potentialEnergy=True, + ) + ) + simulation.reporters.append( + ExitFileReporter( + filename=f"{output_dir}/{state}", + reportInterval=EXIT_REPORTER_INTERVAL, + file_prefix=state, + ) + ) + simulation.reporters.append( + app.PDBReporter( + file=f"{output_dir}/folded_{angle}_{seed}.pdb", + reportInterval=CHECKPOINT_INTERVAL, + ) + ) + + return simulation + + +def create_random_modifications_to_system_config() -> Dict: + """create modifications of the desired parameters. + + Looks at the base bittensor config and parses parameters that we have deemed to be + valid for random sampling. This is to increase the problem space for miners. + """ + sampler = lambda min_val, max_val: round(np.random.uniform(min_val, max_val), 2) + + system_kwargs = {"temperature": sampler(200, 400), "friction": sampler(0.9, 1.1)} + + return system_kwargs + + +def create_new_challenge(pdb_id: str, angles=(180, 180)) -> Dict: + """Create a new challenge by sampling a random pdb_id and running a hyperparameter search + using the try_prepare_challenge function. + Args: + exclude (List): list of pdb_ids to exclude from the search + Returns: + Dict: event dictionary containing the results of the hyperparameter search + """ + + forward_start_time = time.time() + + # Perform a hyperparameter search until we find a valid configuration for the pdb + # bt.logging.warning(f"Attempting to prepare challenge for pdb {pdb_id}") + protein, event = try_prepare_challenge(pdb_id=pdb_id, angles=angles) + + if event.get("validator_search_status"): + bt.logging.success(f"✅✅ Successfully created challenge for pdb_id {pdb_id} ✅✅") + else: + # forward time if validator step fails + event["hp_search_time"] = time.time() - forward_start_time + bt.logging.error( + f"❌❌ All hyperparameter combinations failed for pdb_id {pdb_id}.. Skipping! ❌❌" + ) + + return protein, event + + +def try_prepare_challenge(pdb_id: str, angles=(180, 180)) -> Dict: + """Attempts to setup a simulation environment for the specific pdb & config + Uses a stochastic sampler to find hyperparameters that are compatible with the protein + """ + + # exclude_in_hp_search = parse_config(config) + hp_sampler = HyperParameters() + + bt.logging.info(f"Searching parameter space for pdb {pdb_id}") + + system_kwargs = create_random_modifications_to_system_config() + + for tries in tqdm( + range(hp_sampler.TOTAL_COMBINATIONS), total=hp_sampler.TOTAL_COMBINATIONS + ): + hp_sampler_time = time.time() + + event = {"hp_tries": tries} + sampled_combination: Dict = hp_sampler.sample_hyperparameters() + + hps = { + "ff": sampled_combination["FF"], + "water": sampled_combination["WATER"], + "box": sampled_combination["BOX"], + } + + config = Box({"force_use_pdb": False, "input_source": "rcsb"}) + + protein = Protein( + pdb_id=pdb_id, config=config, system_kwargs=system_kwargs, **hps + ) + + try: + protein.setup_simulation(angles=angles) + + except Exception as e: + bt.logging.error(f"Error occurred for pdb_id {pdb_id}: {e}") + event["validator_search_status"] = False + + finally: + event["pdb_id"] = pdb_id + event.update(hps) # add the dictionary of hyperparameters to the event + event["hp_sample_time"] = time.time() - hp_sampler_time + event["pdb_complexity"] = [dict(protein.pdb_complexity)] + event["init_energy"] = protein.init_energy + event["angle"] = angles[0] + event["system_kwargs"] = system_kwargs + + if "validator_search_status" not in event: + bt.logging.warning("✅✅ Simulation ran successfully! ✅✅") + event["validator_search_status"] = True # simulation passed! + # break out of the loop if the simulation was successful + break + if tries == 3: + bt.logging.error(f"Max tries reached for pdb_id {pdb_id} :x::x:") + break + + return protein, event + + +def sample_pdb(exclude: List = [], pdb_id: str = None): + return pdb_id or select_random_pdb_id(PDB_IDS, exclude=exclude) + + +def extact_energies(state: str, data_directory: str, num_experiments: int): + check_log_file = pd.read_csv( + os.path.join(data_directory, f"{state}_{num_experiments}.log") + ) + + return check_log_file["Potential Energy (kJ/mole)"].values + + +def cpt_file_mapper(output_dir: str, state: str, angles: tuple): + if state == "nvt": + return f"{output_dir}/em.cpt" + + if "npt" in state: + state = "nvt" + state.split("npt")[-1] + + if "md" in state: + state = "npt" + state.split("md_0_1")[-1] + + return f"{output_dir}/{state}.cpt" + + +if __name__ == "__main__": + angles = [(0, 0), (30, 30), (60, 60), (90, 90), (120, 120), (150, 150)] + + pdbs_to_exclude = [] + pdb_ids = ["5kxs", "6qdy"] + for pdb_id in pdb_ids: + create_wandb_run( + project="folding-openmm", + entity="macrocosmos", + tags=["unfolding"], + pdb_id=pdb_id, + ) + + for angle in angles: + num_experiments = 3 + while num_experiments > 0: + try: + protein, event = create_new_challenge(pdb_id=pdb_id, angles=angle) + except Exception as e: + bt.logging.error(f"Error occurred for pdb_id {pdb_id}: {e}") + continue + + event["num_experiment"] = num_experiments + pdb_obj: app.PDBFile = protein.load_pdb_file(protein.pdb_location) + output_dir = protein.validator_directory + + system_config = copy.deepcopy(protein.system_config) + + for state, steps_to_run in SIMULATION_STEPS.items(): + # Creates the simulation object needed for the stage. + partitions = steps_to_run // 100_000 + + temp_state = state + f"_angle_{angle}" + + simulation = configure_commands( + state=temp_state, + seed=num_experiments, + system_config=system_config, + pdb_obj=pdb_obj, + output_dir=protein.validator_directory, + pdb_id=pdb_id, + angle=angle[0], + ) + + bt.logging.info( + f"Running {state} for {steps_to_run} steps for pdb {pdb_id}" + ) + if state == "nvt": + mapper_state = state + else: + mapper_state = temp_state + + simulation.loadCheckpoint( + cpt_file_mapper(output_dir, mapper_state, angle) + ) + + start_time = time.time() + for _ in range(partitions): + simulation.step(100_000) + event["energy"] = ( + simulation.context.getState( + getEnergy=True + ).getPotentialEnergy() + / unit.kilojoules_per_mole + ) + event["time"] = time.time() - start_time + event["step"] = simulation.currentStep + log_event( + event, + protein_vis=f"{protein.validator_directory}/folded_{angle[0]}_{num_experiments}.pdb", + ) + simulation_time = time.time() - start_time + event[f"{state}_time"] = simulation_time + + energy_array = extact_energies( + state=temp_state, + data_directory=protein.validator_directory, + num_experiments=num_experiments, + ) + # event[f"{state}_energies_angle_{angle}"] = energy_array.tolist() + + fig = px.scatter( + energy_array, + title=f"Energy array for {pdb_id} for state {state} for angle {angle}", + labels={"index": "Step", "value": "energy"}, + height=600, + width=1400, + ) + fig.write_image( + os.path.join(output_dir, f"{mapper_state}_energy.png") + ) + + log_event(event) + num_experiments = num_experiments - 1 diff --git a/folding/unfolding/unfold.py b/folding/unfolding/unfold.py new file mode 100644 index 00000000..988bf3ad --- /dev/null +++ b/folding/unfolding/unfold.py @@ -0,0 +1,104 @@ +from typing import List + +import Bio +from Bio.PDB.vectors import rotaxis +from Bio.PDB import PDBParser, PDBIO, PPBuilder + +import numpy as np + + +def rotate_atoms(atoms: List[Bio.PDB.Atom.Atom], rotation_axis, rotation_point, angle): + """ + Rotate atoms around a rotation axis by a given angle. + """ + rotation_matrix = rotaxis(angle, rotation_axis) + for atom in atoms: + coord = atom.get_vector() - rotation_point + new_coord = coord.left_multiply(rotation_matrix) + rotation_point + + # This should be an in-place operation + atom.set_coord(new_coord.get_array()) + + +def modify_phi_angle(residue, pp, position, delta_phi): + # Modify phi angle + if residue.has_id("N") and residue.has_id("CA"): + N = residue["N"].get_vector() + CA = residue["CA"].get_vector() + axis_phi = (CA - N).normalized() + + # Atoms to rotate for phi: all atoms after CA + atoms_to_rotate_phi = [] + for res in pp[position:]: + for atom in res.get_atoms(): + atoms_to_rotate_phi.append(atom) + rotate_atoms( + atoms=atoms_to_rotate_phi, + rotation_axis=axis_phi, + rotation_point=N, + angle=delta_phi, + ) + + +def modify_psi_angle(residue, pp, position, delta_psi): + # Modify psi angle + if residue.has_id("CA") and residue.has_id("C"): + CA = residue["CA"].get_vector() + C = residue["C"].get_vector() + axis_psi = (C - CA).normalized() + + # Atoms to rotate for psi: all atoms after C + atoms_to_rotate_psi = [] + for res in pp[position + 1 :]: + for atom in res.get_atoms(): + atoms_to_rotate_psi.append(atom) + rotate_atoms( + atoms=atoms_to_rotate_psi, + rotation_axis=axis_psi, + rotation_point=CA, + angle=delta_psi, + ) + + +def unfold_protein(pdb_location: str, output_location: str, pbc: str = None, angle: int = 180,): + """Method to unfold the protein""" + parser = PDBParser() + structure = parser.get_structure(id="protein", file=pdb_location) + ppb = PPBuilder() + peptides = ppb.build_peptides(entity=structure, aa_only=1) + # Iterate over all peptides + for peptide in peptides: + phi_psi = peptide.get_phi_psi_list() + for i, residue in enumerate(peptide): + phi, psi = phi_psi[i] + # Skip residues where phi or psi is None + if phi is None or psi is None: + continue + + # Define new angles to simulate unfolding (e.g., set to 180 degrees) + new_phi = np.deg2rad(angle) + new_psi = np.deg2rad(angle) + + # Calculate angle differences + delta_phi = new_phi - phi + delta_psi = new_psi - psi + + # Modify phi angle + modify_phi_angle( + residue=residue, pp=peptide, position=i, delta_phi=delta_phi + ) + + # Modify psi angle + modify_psi_angle( + residue=residue, pp=peptide, position=i, delta_psi=delta_psi + ) + + io = PDBIO() + io.set_structure(structure) + io.save(pdb_location) + if pbc: + with open(pdb_location, "r") as f: + pdb = f.read() + pdb_with_pbc = pbc + "\n" + pdb + with open(pdb_location, "w") as f: + f.write(pdb_with_pbc) diff --git a/folding/utils/logging.py b/folding/utils/logging.py index b134c98d..312dea0a 100644 --- a/folding/utils/logging.py +++ b/folding/utils/logging.py @@ -112,6 +112,16 @@ def log_folded_protein(run, pdb_id_path: str): bt.logging.warning("Failed to log folded protein visualization") +def log_original_protein(run, pdb_id_path: str): + """Logs the folded protein visualization to wandb. + pdb_id_path: str: path to the pdb file on disk. + """ + try: + run.log({"original_protein_vis": wandb.Molecule(pdb_id_path)}) + except: + bt.logging.warning("Failed to log original protein visualization") + + def log_event( self, event, @@ -134,6 +144,9 @@ def log_event( if pdb_location is not None: log_protein(run, pdb_id_path=pdb_location) + log_original_protein( + run, pdb_id_path=pdb_location.replace(".pdb", "_original.pdb") + ) if folded_protein_location is not None: log_folded_protein(run, pdb_id_path=folded_protein_location) wandb.save(folded_protein_location) diff --git a/folding/utils/pdb_utils.py b/folding/utils/pdb_utils.py new file mode 100644 index 00000000..3b344023 --- /dev/null +++ b/folding/utils/pdb_utils.py @@ -0,0 +1,57 @@ +from typing import List + +import Bio +from Bio.PDB.vectors import rotaxis + + +def rotate_atoms(atoms: List[Bio.PDB.Atom.Atom], rotation_axis, rotation_point, angle): + """ + Rotate atoms around a rotation axis by a given angle. + """ + rotation_matrix = rotaxis(angle, rotation_axis) + for atom in atoms: + coord = atom.get_vector() - rotation_point + new_coord = coord.left_multiply(rotation_matrix) + rotation_point + + # This should be an in-place operation + atom.set_coord(new_coord.get_array()) + + +def modify_phi_angle(residue, pp, position, delta_phi): + # Modify phi angle + if residue.has_id("N") and residue.has_id("CA"): + N = residue["N"].get_vector() + CA = residue["CA"].get_vector() + axis_phi = (CA - N).normalized() + + # Atoms to rotate for phi: all atoms after CA + atoms_to_rotate_phi = [] + for res in pp[position:]: + for atom in res.get_atoms(): + atoms_to_rotate_phi.append(atom) + rotate_atoms( + atoms=atoms_to_rotate_phi, + rotation_axis=axis_phi, + rotation_point=N, + angle=delta_phi, + ) + + +def modify_psi_angle(residue, pp, position, delta_psi): + # Modify psi angle + if residue.has_id("CA") and residue.has_id("C"): + CA = residue["CA"].get_vector() + C = residue["C"].get_vector() + axis_psi = (C - CA).normalized() + + # Atoms to rotate for psi: all atoms after C + atoms_to_rotate_psi = [] + for res in pp[position + 1 :]: + for atom in res.get_atoms(): + atoms_to_rotate_psi.append(atom) + rotate_atoms( + atoms=atoms_to_rotate_psi, + rotation_axis=axis_psi, + rotation_point=CA, + angle=delta_psi, + ) diff --git a/folding/validators/forward.py b/folding/validators/forward.py index 03fbc79b..31bad9cb 100644 --- a/folding/validators/forward.py +++ b/folding/validators/forward.py @@ -140,7 +140,7 @@ def parse_config(config) -> Dict[str, str]: return exclude_in_hp_search -def create_new_challenge(self, exclude: List) -> Dict: +def create_new_challenge(self, exclude: List, angle=180) -> Dict: """Create a new challenge by sampling a random pdb_id and running a hyperparameter search using the try_prepare_challenge function. @@ -166,7 +166,7 @@ def create_new_challenge(self, exclude: List) -> Dict: # Perform a hyperparameter search until we find a valid configuration for the pdb bt.logging.info(f"Attempting to prepare challenge for pdb {pdb_id}") - event = try_prepare_challenge(config=self.config, pdb_id=pdb_id) + event = try_prepare_challenge(config=self.config, pdb_id=pdb_id, angle=angle) event["input_source"] = self.config.protein.input_source if event.get("validator_search_status"): @@ -201,7 +201,7 @@ def create_random_modifications_to_system_config(config) -> Dict: return system_kwargs -def try_prepare_challenge(config, pdb_id: str) -> Dict: +def try_prepare_challenge(config, pdb_id: str, angle=180) -> Dict: """Attempts to setup a simulation environment for the specific pdb & config Uses a stochastic sampler to find hyperparameters that are compatible with the protein """ @@ -254,12 +254,12 @@ def try_prepare_challenge(config, pdb_id: str) -> Dict: ) try: - protein.setup_simulation() + protein.setup_simulation(angle=angle) - if protein.init_energy > 0: - raise ValueError( - f"Initial energy is positive: {protein.init_energy}. Simulation failed." - ) + # if protein.init_energy > 0: + # raise ValueError( + # f"Initial energy is positive: {protein.init_energy}. Simulation failed." + # ) except Exception: # full traceback @@ -274,6 +274,7 @@ def try_prepare_challenge(config, pdb_id: str) -> Dict: event["init_energy"] = protein.init_energy event["epsilon"] = protein.epsilon event["system_kwargs"] = system_kwargs + event["angle"] = angle if "validator_search_status" not in event: bt.logging.success("✅✅ Simulation ran successfully! ✅✅") diff --git a/folding/validators/protein.py b/folding/validators/protein.py index 1328a5de..a68a1dfc 100644 --- a/folding/validators/protein.py +++ b/folding/validators/protein.py @@ -16,6 +16,7 @@ from openmm import app, unit from pdbfixer import PDBFixer + from folding.base.simulation import OpenMMSimulation from folding.store import Job from folding.utils.opemm_simulation_config import SimulationConfig @@ -27,6 +28,7 @@ write_pkl, load_and_sample_random_pdb_ids, ) +from folding.unfolding.unfold import unfold_protein ROOT_DIR = Path(__file__).resolve().parents[2] @@ -153,7 +155,16 @@ def gather_pdb_id(self): ) # TODO: This should be a class variable via config bt.logging.debug(f"Selected random pdb id: {self.pdb_id!r}") - def setup_pdb_directory(self): + def get_pbc(self): + with open(self.pdb_location, "r") as file: + for line in file: + if line.startswith("CRYST1"): + return ( + line.strip() + ) # Return the line without leading/trailing whitespace + return None + + def setup_pdb_directory(self, angle=180): # if directory doesn't exist, download the pdb file and save it to the directory if not os.path.exists(self.pdb_directory): os.makedirs(self.pdb_directory) @@ -170,6 +181,14 @@ def setup_pdb_directory(self): f"Failed to download {self.pdb_file} to {self.pdb_directory}" ) self.fix_pdb_file() + pbc = self.get_pbc() + unfold_protein( + pdb_location=self.pdb_location, + output_location=self.pdb_location, + pbc=pbc, + angle=angle, + ) + else: bt.logging.info( f"PDB file {self.pdb_file} already exists in path {self.pdb_directory!r}." @@ -197,7 +216,7 @@ def read_and_return_files(self, filenames: List) -> Dict: continue return files_to_return - def setup_simulation(self): + def setup_simulation(self, angle=180): """forward method defines the following: 1. gather the pdb_id and setup the namings. 2. setup the pdb directory and download the pdb file if it doesn't exist. @@ -211,7 +230,7 @@ def setup_simulation(self): ## Setup the protein directory and sample a random pdb_id if not provided self.gather_pdb_id() - self.setup_pdb_directory() + self.setup_pdb_directory(angle=angle) self.generate_input_files() diff --git a/neurons/validator.py b/neurons/validator.py index 641cb29c..deace6f6 100644 --- a/neurons/validator.py +++ b/neurons/validator.py @@ -167,6 +167,7 @@ def sample_random_uids( List[int]: A list of responding and free uids. """ active_uids = self.ping_all_miners(exclude_uids=exclude_uids) + return [88] if len(active_uids) > num_uids_to_sample: return random.sample(active_uids, num_uids_to_sample) @@ -181,7 +182,7 @@ def add_jobs(self, k: int): Args: k (int): The number of jobs create and distribute to miners. """ - + angles = [0, 30, 60, 90, 120] # Deploy K number of unique pdb jobs, where each job gets distributed to self.config.neuron.sample_size miners for ii in range(k): bt.logging.info(f"Adding job: {ii+1}/{k}") @@ -206,7 +207,9 @@ def add_jobs(self, k: int): if len(valid_uids) >= self.config.neuron.sample_size: # With the above logic, we know we have a valid set of uids. # selects a new pdb, downloads data, preprocesses and gets hyperparams. - job_event: Dict = create_new_challenge(self, exclude=exclude_pdbs) + job_event: Dict = create_new_challenge( + self, exclude=exclude_pdbs, angle=angles[ii] + ) job_event["uid_search_time"] = uid_search_time selected_hotkeys = [self.metagraph.hotkeys[uid] for uid in valid_uids]