openforcefield · mattwthompson · Nov 13, 2023 · Nov 17, 2023
@@ -17,14 +17,12 @@
     CurationWorkflowSchema,
 )
 from openff.evaluator.forcefield import ParameterGradientKey
-from openff.evaluator.layers.simulation import SimulationSchema
 from openff.evaluator.properties import (
     Density,
     DielectricConstant,
     EnthalpyOfMixing,
     EnthalpyOfVaporization,
     ExcessMolarVolume,
-    SolvationFreeEnergy,
 )
 from openff.evaluator.server import EvaluatorServer
 from openff.evaluator.storage import LocalFileStorage
@@ -62,18 +60,6 @@ def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
         )
 
         data_set.add_properties(
-            SolvationFreeEnergy(
-                thermodynamic_state=states[1],
-                phase=PropertyPhase.Liquid,
-                substance=ethanol_substance,
-                value=0.0 * SolvationFreeEnergy.default_unit(),
-            ),
-            SolvationFreeEnergy(
-                thermodynamic_state=states[1],
-                phase=PropertyPhase.Liquid,
-                substance=ethanal_substance,
-                value=0.0 * SolvationFreeEnergy.default_unit(),
-            ),
             *CurationWorkflow.apply(
                 PhysicalPropertyDataSet(),
                 CurationWorkflowSchema(
@@ -126,75 +112,6 @@ def define_data_set(reweighting: bool) -> PhysicalPropertyDataSet:
     return data_set
 
 
-def solvation_free_energy_schema() -> SimulationSchema:
-    """Override trailblazing to use the lambda values of used in the previous OFF study
-    https://github.com/MobleyLab/SMIRNOFF_paper_code/tree/master/FreeSolv
-    """
-
-    default_schema = SolvationFreeEnergy.default_simulation_schema()
-    workflow_schema = default_schema.workflow_schema
-
-    conditional_group_schema = next(
-        x for x in workflow_schema.protocol_schemas if x.id == "conditional_group"
-    )
-    conditional_group = conditional_group_schema.to_protocol()
-
-    yank_protocol = conditional_group.protocols["run_solvation_yank"]
-
-    yank_protocol.electrostatic_lambdas_1 = [
-        1.00,
-        0.75,
-        0.50,
-        0.25,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-        0.00,
-    ]
-    yank_protocol.steric_lambdas_1 = [
-        1.00,
-        1.00,
-        1.00,
-        1.00,
-        1.00,
-        0.95,
-        0.90,
-        0.80,
-        0.70,
-        0.60,
-        0.50,
-        0.40,
-        0.35,
-        0.30,
-        0.25,
-        0.20,
-        0.15,
-        0.10,
-        0.05,
-        0.00,
-    ]
-
-    yank_protocol.electrostatic_lambdas_2 = [1.00, 0.75, 0.50, 0.25, 0.00]
-    yank_protocol.steric_lambdas_2 = [1.00, 1.00, 1.00, 1.00, 1.00]
-
-    workflow_schema.protocol_schemas.remove(conditional_group_schema)
-    workflow_schema.protocol_schemas.append(conditional_group.schema)
-
-    return default_schema
-
-
 def main():
     setup_timestamp_logging()
 
@@ -212,7 +129,7 @@ def main():
             minimum_number_of_workers=1,
             maximum_number_of_workers=12,
             resources_per_worker=QueueWorkerResources(
-                number_of_gpus=1,
+                number_of_gpus=0,
                 preferred_gpu_toolkit=QueueWorkerResources.GPUToolkit.CUDA,
                 per_thread_memory_limit=5 * unit.gigabyte,
                 wallclock_time_limit="05:59",
@@ -230,7 +147,7 @@ def main():
             ):
                 client = EvaluatorClient()
 
-                for allowed_layer in ["SimulationLayer", "ReweightingLayer"]:
+                for allowed_layer in ["ReweightingLayer"]:
                     data_set = define_data_set(allowed_layer == "ReweightingLayer")
 
                     options = RequestOptions()
@@ -239,13 +156,6 @@ def main():
                         property_type: {} for property_type in data_set.property_types
                     }
 
-                    if allowed_layer == "SimulationLayer":
-                        options.add_schema(
-                            "SimulationLayer",
-                            "SolvationFreeEnergy",
-                            solvation_free_energy_schema(),
-                        )
-
                     request, _ = client.request_estimate(
                         data_set,
                         ForceField("openff-1.2.0.offxml"),

@@ -0,0 +1,16 @@
+import pandas
+
+from openff.evaluator.datasets.curation.components.freesolv import (
+    ImportFreeSolv,
+    ImportFreeSolvSchema,
+)
+
+
+def test_import_free_solv_data():
+    """Checks that importing FreeSolv through the curation component yields
+    as many rows as the remotely hosted database contains."""
+
+    expected_row_count = len(ImportFreeSolv._download_free_solv())
+
+    imported_frame = ImportFreeSolv.apply(pandas.DataFrame(), ImportFreeSolvSchema())
+
+    assert imported_frame is not None
+    assert len(imported_frame) == expected_row_count
@@ -404,7 +404,7 @@ def start(self):
             cores=self._resources_per_worker.number_of_threads,
             memory=memory_string,
             walltime=self._resources_per_worker.wallclock_time_limit,
-            job_extra=job_extra,
+            job_extra_directives=job_extra,
             env_extra=env_extra,
             extra=extra,
             local_directory="dask-worker-space",

@@ -0,0 +1,184 @@
+import io
+import logging
+import re
+from typing import List, Union
+
+import pandas
+import requests
+from typing_extensions import Literal
+
+from openff.evaluator.datasets import (
+    MeasurementSource,
+    PhysicalPropertyDataSet,
+    PropertyPhase,
+)
+from openff.evaluator.datasets.curation.components import (
+    CurationComponent,
+    CurationComponentSchema,
+)
+from openff.evaluator.substances import Component, ExactAmount, MoleFraction, Substance
+from openff.evaluator.thermodynamics import ThermodynamicState
+
+# ``SolvationFreeEnergy`` may not be importable from the properties module. In
+# that case an empty placeholder class with the same name is defined so that
+# importing this module does not fail.
+# NOTE(review): the placeholder accepts no constructor arguments and has no
+# ``default_unit``, so ``ImportFreeSolv._apply`` presumably cannot run while
+# the fallback is active -- confirm this is the intended behaviour.
+try:
+    from openff.evaluator.properties import SolvationFreeEnergy
+except ImportError:
+
+    class SolvationFreeEnergy:
+        pass
+
+
+logger = logging.getLogger(__name__)
+
+
+# Schema for the ``ImportFreeSolv`` component. It adds no fields of its own
+# besides ``type``, which is fixed to the literal "ImportFreeSolv".
+class ImportFreeSolvSchema(CurationComponentSchema):
+    type: Literal["ImportFreeSolv"] = "ImportFreeSolv"
+
+
+class ImportFreeSolv(CurationComponent):
+    """A component which will import the latest version of the FreeSolv
+    data set from the GitHub repository where it is stored.
+    """
+
+    @classmethod
+    def _download_free_solv(cls) -> pandas.DataFrame:
+        """Downloads the FreeSolv data set from GitHub.
+
+        Returns
+        -------
+            The FreeSolv data stored in a pandas data frame.
+
+        Raises
+        ------
+        requests.exceptions.RequestException
+            If the request times out, fails, or returns an HTTP error status.
+        """
+
+        # Download the database from GitHub. A timeout is set so that an
+        # unresponsive server raises instead of blocking indefinitely.
+        download_request = requests.get(
+            "https://raw.githubusercontent.com/MobleyLab/FreeSolv/master/database.txt",
+            timeout=60,
+        )
+        download_request.raise_for_status()
+
+        # Unify the delimiter by collapsing "; " into ";" so that a single
+        # character separator can be used when parsing.
+        text_contents = download_request.text.replace("; ", ";")
+
+        # Convert the set to a pandas object, skipping the first two lines.
+        return pandas.read_csv(io.StringIO(text_contents), delimiter=";", skiprows=2)
+
+    @classmethod
+    def _validate_doi(cls, doi: str) -> str:
+        """Attempts to validate a string which may contain a (or multiple)
+        digital object identifier. If a valid DOI is not found, the FreeSolv
+        DOI itself is returned.
+
+        Parameters
+        ----------
+        doi
+            The raw reference text. Several DOIs may be joined by `` and ``.
+            Whitespace surrounding each candidate is ignored, and a value which
+            is not a string (e.g. a missing cell) is treated as holding no DOI.
+
+        Returns
+        -------
+            The matched DOIs joined by `` + ``, or the FreeSolv DOI when
+            nothing matched.
+        """
+
+        fall_back_doi = "10.5281/zenodo.596537"
+
+        # Guard against non-string input, which has no ``split`` method.
+        if not isinstance(doi, str):
+            return fall_back_doi
+
+        # From https://www.crossref.org/blog/dois-and-matching-regular-expressions/
+        doi_patterns = [
+            r"^10.\d{4,9}/[-._;()/:A-Z0-9]+$",
+            r"^10.1002/[^\s]+$",
+            r"^10.\d{4}/\d+-\d+X?(\d+)\d+<[\d\w]+:[\d\w]*>\d+.\d+.\w+;\d$",
+            r"^10.1021/\w\w\d+$",
+            r"^10.1207/[\w\d]+\&\d+_\d+$",
+        ]
+
+        matched_dois: List[str] = []
+
+        # Split the string to try and catch concatenated DOIs
+        for candidate in doi.split(" and "):
+            # The patterns are anchored at both ends, so stray padding around a
+            # candidate would otherwise prevent an otherwise valid DOI matching.
+            candidate = candidate.strip()
+
+            for doi_pattern in doi_patterns:
+                regex_match = re.match(doi_pattern, candidate, re.I)
+
+                if regex_match:
+                    # Only the first pattern which matches is recorded.
+                    matched_dois.append(regex_match.group())
+                    break
+
+        return " + ".join(matched_dois) if matched_dois else fall_back_doi
+
+    @classmethod
+    def _apply(
+        cls,
+        data_frame: pandas.DataFrame,
+        schema: ImportFreeSolvSchema,
+        n_processes,
+    ) -> pandas.DataFrame:
+        """Appends the FreeSolv data set to ``data_frame``.
+
+        Parameters
+        ----------
+        data_frame
+            The data frame to append the FreeSolv entries to.
+        schema
+            The schema of this component. It is not read by this method.
+        n_processes
+            Not used by this method.
+
+        Returns
+        -------
+            A new data frame holding the rows of ``data_frame`` followed by
+            the FreeSolv entries, as converted by
+            ``PhysicalPropertyDataSet.to_pandas``.
+        """
+        from openff.units import unit
+
+        # Download the raw FreeSolv table.
+        free_solv_data_frame = cls._download_free_solv()
+
+        # The target unit does not depend on the row, so look it up once. The
+        # class imported at module level is used both here and when building
+        # the entries so that the two always refer to the same object.
+        default_unit = SolvationFreeEnergy.default_unit()
+
+        data_entries = []
+
+        for _, row in free_solv_data_frame.iterrows():
+            # Extract and standardize the SMILES pattern of the solute.
+            solute_smiles = Component(row["SMILES"].strip()).smiles
+
+            # Build the substance: water as the solvent plus a single
+            # solute molecule.
+            substance = Substance()
+            substance.add_component(Component(smiles="O"), MoleFraction(1.0))
+            substance.add_component(
+                Component(smiles=solute_smiles, role=Component.Role.Solute),
+                ExactAmount(1),
+            )
+
+            # Extract the value and uncertainty
+            value = (
+                float(row["experimental value (kcal/mol)"])
+                * unit.kilocalorie
+                / unit.mole
+            )
+            std_error = (
+                float(row["experimental uncertainty (kcal/mol)"])
+                * unit.kilocalorie
+                / unit.mole
+            )
+
+            # Attempt to extract a DOI
+            original_source = row[
+                "experimental reference (original or paper this value was taken from)"
+            ]
+            doi = cls._validate_doi(original_source)
+
+            data_entries.append(
+                SolvationFreeEnergy(
+                    thermodynamic_state=ThermodynamicState(
+                        temperature=298.15 * unit.kelvin,
+                        pressure=101.325 * unit.kilopascal,
+                    ),
+                    phase=PropertyPhase.Liquid,
+                    substance=substance,
+                    value=value.to(default_unit),
+                    uncertainty=std_error.to(default_unit),
+                    source=MeasurementSource(doi=doi),
+                )
+            )
+
+        data_set = PhysicalPropertyDataSet()
+        data_set.add_properties(*data_entries)
+
+        # Append the imported entries after any rows already present.
+        return pandas.concat(
+            [data_frame, data_set.to_pandas()], ignore_index=True, sort=False
+        )
+
+
+# Union of the component schemas defined in this module, for use in the
+# curation workflow schema's list of component types. With a single member,
+# ``Union[X]`` evaluates to ``X`` itself.
+FreeSolvComponentSchema = Union[ImportFreeSolvSchema]
@@ -13,6 +13,9 @@
 from openff.evaluator.datasets.curation.components.filtering import (
     FilterComponentSchema,
 )
+from openff.evaluator.datasets.curation.components.freesolv import (
+    FreeSolvComponentSchema,
+)
 from openff.evaluator.datasets.curation.components.selection import (
     SelectionComponentSchema,
 )
@@ -31,6 +34,7 @@ class CurationWorkflowSchema(BaseModel):
         Union[
             ConversionComponentSchema,
             FilterComponentSchema,
+            FreeSolvComponentSchema,
             SelectionComponentSchema,
             ThermoMLComponentSchema,
         ]