Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 2 additions & 92 deletions integration-tests/default-workflows/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,12 @@
CurationWorkflowSchema,
)
from openff.evaluator.forcefield import ParameterGradientKey
from openff.evaluator.layers.simulation import SimulationSchema
from openff.evaluator.properties import (
Density,
DielectricConstant,
EnthalpyOfMixing,
EnthalpyOfVaporization,
ExcessMolarVolume,
SolvationFreeEnergy,
)
from openff.evaluator.server import EvaluatorServer
from openff.evaluator.storage import LocalFileStorage
Expand Down Expand Up @@ -62,18 +60,6 @@ def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
)

data_set.add_properties(
SolvationFreeEnergy(
thermodynamic_state=states[1],
phase=PropertyPhase.Liquid,
substance=ethanol_substance,
value=0.0 * SolvationFreeEnergy.default_unit(),
),
SolvationFreeEnergy(
thermodynamic_state=states[1],
phase=PropertyPhase.Liquid,
substance=ethanal_substance,
value=0.0 * SolvationFreeEnergy.default_unit(),
),
*CurationWorkflow.apply(
PhysicalPropertyDataSet(),
CurationWorkflowSchema(
Expand Down Expand Up @@ -126,75 +112,6 @@ def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
return data_set


def solvation_free_energy_schema() -> SimulationSchema:
    """Build a solvation free energy schema which uses fixed lambda windows.

    Instead of trailblazing, the YANK protocol found in the conditional
    group of the default workflow is assigned the lambda values used in the
    previous OFF study:
    https://github.com/MobleyLab/SMIRNOFF_paper_code/tree/master/FreeSolv

    Returns
    -------
        The default ``SolvationFreeEnergy`` simulation schema, with the
        schema of its conditional group swapped for the modified one.
    """

    schema = SolvationFreeEnergy.default_simulation_schema()
    workflow = schema.workflow_schema

    # Locate the group which wraps the YANK calculation and turn it into a
    # protocol object so that its inputs can be edited.
    original_group_schema = next(
        protocol_schema
        for protocol_schema in workflow.protocol_schemas
        if protocol_schema.id == "conditional_group"
    )
    group = original_group_schema.to_protocol()
    yank = group.protocols["run_solvation_yank"]

    # First set of windows (20 states): the electrostatic lambda is switched
    # off over the first five states, then the steric lambda is scaled down
    # to zero.
    yank.electrostatic_lambdas_1 = [1.00, 0.75, 0.50, 0.25] + [0.00] * 16
    yank.steric_lambdas_1 = [1.00] * 5 + [
        0.95,
        0.90,
        0.80,
        0.70,
        0.60,
        0.50,
        0.40,
        0.35,
        0.30,
        0.25,
        0.20,
        0.15,
        0.10,
        0.05,
        0.00,
    ]

    # Second set of windows (5 states): only the electrostatic lambda changes.
    yank.electrostatic_lambdas_2 = [1.00, 0.75, 0.50, 0.25, 0.00]
    yank.steric_lambdas_2 = [1.00] * 5

    # Replace the stale group schema with the one generated from the edited
    # protocol.
    workflow.protocol_schemas.remove(original_group_schema)
    workflow.protocol_schemas.append(group.schema)

    return schema


def main():
setup_timestamp_logging()

Expand All @@ -212,7 +129,7 @@ def main():
minimum_number_of_workers=1,
maximum_number_of_workers=12,
resources_per_worker=QueueWorkerResources(
number_of_gpus=1,
number_of_gpus=0,
preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA,
per_thread_memory_limit=5 * unit.gigabyte,
wallclock_time_limit="05:59",
Expand All @@ -230,7 +147,7 @@ def main():
):
client = EvaluatorClient()

for allowed_layer in ["SimulationLayer", "ReweightingLayer"]:
for allowed_layer in ["ReweightingLayer"]:
data_set = define_data_set(allowed_layer == "ReweightingLayer")

options = RequestOptions()
Expand All @@ -239,13 +156,6 @@ def main():
property_type: {} for property_type in data_set.property_types
}

if allowed_layer == "SimulationLayer":
options.add_schema(
"SimulationLayer",
"SolvationFreeEnergy",
solvation_free_energy_schema(),
)

request, _ = client.request_estimate(
data_set,
ForceField("openff-1.2.0.offxml"),
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pandas

from openff.evaluator.datasets.curation.components.freesolv import (
ImportFreeSolv,
ImportFreeSolvSchema,
)


def test_import_free_solv_data():
    """Check that applying ``ImportFreeSolv`` to an empty data frame yields
    one row per row of the FreeSolv table downloaded from the remote
    source."""

    expected_n_rows = len(ImportFreeSolv._download_free_solv())

    imported = ImportFreeSolv.apply(pandas.DataFrame(), ImportFreeSolvSchema())

    assert imported is not None
    assert len(imported) == expected_n_rows
2 changes: 1 addition & 1 deletion openff/evaluator/backends/dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,7 +404,7 @@ def start(self):
cores=self._resources_per_worker.number_of_threads,
memory=memory_string,
walltime=self._resources_per_worker.wallclock_time_limit,
job_extra=job_extra,
job_extra_directives=job_extra,
env_extra=env_extra,
extra=extra,
local_directory="dask-worker-space",
Expand Down
184 changes: 184 additions & 0 deletions openff/evaluator/datasets/curation/components/freesolv.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import io
import logging
import re
from typing import List, Union

import pandas
import requests
from typing_extensions import Literal

from openff.evaluator.datasets import (
MeasurementSource,
PhysicalPropertyDataSet,
PropertyPhase,
)
from openff.evaluator.datasets.curation.components import (
CurationComponent,
CurationComponentSchema,
)
from openff.evaluator.substances import Component, ExactAmount, MoleFraction, Substance
from openff.evaluator.thermodynamics import ThermodynamicState

try:
from openff.evaluator.properties import SolvationFreeEnergy
except ImportError:

class SolvationFreeEnergy:
pass


logger = logging.getLogger(__name__)


# Schema for the ``ImportFreeSolv`` curation component. It carries no options
# beyond the ``type`` tag, which only accepts (and defaults to) the literal
# string "ImportFreeSolv".
class ImportFreeSolvSchema(CurationComponentSchema):
    type: Literal["ImportFreeSolv"] = "ImportFreeSolv"


class ImportFreeSolv(CurationComponent):
    """A component which will import the latest version of the FreeSolv
    data set from the GitHub repository where it is stored.
    """

    @classmethod
    def _download_free_solv(cls, timeout: float = 60.0) -> pandas.DataFrame:
        """Downloads the FreeSolv data set from GitHub.

        Parameters
        ----------
        timeout
            The number of seconds to wait on the server before giving up.
            Without a timeout a stalled connection would block forever.

        Returns
        -------
            The Free Solv data stored in a pandas data frame.

        Raises
        ------
        requests.RequestException
            If the request times out, cannot be made, or the server responds
            with an error status.
        """

        # Download the database from GitHub
        download_request = requests.get(
            "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt",
            timeout=timeout,
        )
        download_request.raise_for_status()

        # Unify the delimiter
        text_contents = download_request.text.replace("; ", ";")

        # Convert the set to a pandas object, skipping the first two lines of
        # the file.
        return pandas.read_csv(io.StringIO(text_contents), delimiter=";", skiprows=2)

    @classmethod
    def _validate_doi(cls, doi: str) -> str:
        """Attempts to validate a string which may contain a (or multiple)
        digital object identifier. If a valid DOI is not found, the FreeSolv
        DOI itself is returned.

        Parameters
        ----------
        doi
            The text to search. Multiple DOIs are expected to be joined by
            ``" and "``. Any non-string value (e.g. the NaN that pandas
            stores for an empty field) is treated as containing no DOI.

        Returns
        -------
            The matched DOIs joined by ``" + "``, or the FreeSolv DOI when
            nothing matched.
        """

        fall_back_doi = "10.5281/zenodo.596537"

        # A missing reference does not reach us as a string, and calling
        # ``split`` on it would raise.
        if not isinstance(doi, str):
            return fall_back_doi

        # From https://www.crossref.org/blog/dois-and-matching-regular-expressions/
        doi_patterns = [
            r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
            r"^10.1002/[^\s]+$",
            r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
            r"^10.1021/\w\w\d+$",
            r"^10.1207/[\w\d]+\&\d+_\d+$",
        ]

        matched_dois: List[str] = []

        # Split the string to try and catch concatenated DOIs
        for split_doi in doi.split(" and "):
            # The patterns are anchored at both ends, so surrounding
            # whitespace would otherwise stop a valid DOI from matching.
            split_doi = split_doi.strip()

            for doi_pattern in doi_patterns:
                regex_match = re.match(doi_pattern, split_doi, re.I)

                if regex_match:
                    matched_dois.append(regex_match.group())
                    break

        return " + ".join(matched_dois) if matched_dois else fall_back_doi

    @classmethod
    def _apply(
        cls,
        data_frame: pandas.DataFrame,
        schema: ImportFreeSolvSchema,
        n_processes,
    ) -> pandas.DataFrame:
        """Downloads the FreeSolv data set and appends it to ``data_frame``.

        Each row of the downloaded table becomes a solvation free energy of
        the solute in water at 298.15 K and 101.325 kPa.

        Parameters
        ----------
        data_frame
            The data frame to append the FreeSolv entries to.
        schema
            The schema of this component. It is not read here.
        n_processes
            Not used by this component.

        Returns
        -------
            A new data frame containing the rows of ``data_frame`` followed
            by the FreeSolv entries.
        """
        from openff.units import unit

        from openff.evaluator import properties

        # The target unit is the same for every row, so look it up once.
        default_unit = properties.SolvationFreeEnergy.default_unit()

        # Convert the data frame into data rows.
        free_solv_data_frame = cls._download_free_solv()

        data_entries = []

        for _, row in free_solv_data_frame.iterrows():
            # Extract and standardize the SMILES pattern of the solute.
            solute_smiles = Component(row["SMILES"].strip()).smiles

            # Build the substance.
            substance = Substance()
            substance.add_component(Component(smiles="O"), MoleFraction(1.0))
            substance.add_component(
                Component(smiles=solute_smiles, role=Component.Role.Solute),
                ExactAmount(1),
            )

            # Extract the value and uncertainty
            value = (
                float(row["experimental value (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )
            std_error = (
                float(row["experimental uncertainty (kcal/mol)"])
                * unit.kilocalorie
                / unit.mole
            )

            # Attempt to extract a DOI
            original_source = row[
                "experimental reference (original or paper this value was taken from)"
            ]
            doi = cls._validate_doi(original_source)

            data_entries.append(
                SolvationFreeEnergy(
                    thermodynamic_state=ThermodynamicState(
                        temperature=298.15 * unit.kelvin,
                        pressure=101.325 * unit.kilopascal,
                    ),
                    phase=PropertyPhase.Liquid,
                    substance=substance,
                    value=value.to(default_unit),
                    uncertainty=std_error.to(default_unit),
                    source=MeasurementSource(doi=doi),
                )
            )

        data_set = PhysicalPropertyDataSet()
        data_set.add_properties(*data_entries)

        free_solv_data_frame = data_set.to_pandas()

        return pandas.concat(
            [data_frame, free_solv_data_frame], ignore_index=True, sort=False
        )


# The union of the component schemas defined in this module. With a single
# member, ``Union`` evaluates to ``ImportFreeSolvSchema`` itself.
FreeSolvComponentSchema = Union[ImportFreeSolvSchema]
4 changes: 4 additions & 0 deletions openff/evaluator/datasets/curation/workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
from openff.evaluator.datasets.curation.components.filtering import (
FilterComponentSchema,
)
from openff.evaluator.datasets.curation.components.freesolv import (
FreeSolvComponentSchema,
)
from openff.evaluator.datasets.curation.components.selection import (
SelectionComponentSchema,
)
Expand All @@ -31,6 +34,7 @@ class CurationWorkflowSchema(BaseModel):
Union[
ConversionComponentSchema,
FilterComponentSchema,
FreeSolvComponentSchema,
SelectionComponentSchema,
ThermoMLComponentSchema,
]
Expand Down