From 9ffbe15dc38939fa0b71be9df5bacbab3a15a9e3 Mon Sep 17 00:00:00 2001 From: David Mobley Date: Sat, 10 Oct 2020 17:02:53 -0700 Subject: [PATCH 1/3] Add submission files, user maps --- .../logP/Analysis/SAMPL7-user-map-HG.csv | 35 +++ .../logP/Analysis/Scripts/get_usermap.py | 16 ++ .../Submissions/LogP_chemprop_submission.csv | 165 ++++++++++++ .../Submissions/logP-ChrisLoschen-1.csv | 118 ++++++++ .../logP-ChrisLoschen-1_5l9kQR3.csv | 118 ++++++++ .../Submissions/logP-ChrisLoschen-2.csv | 118 ++++++++ .../Analysis/Submissions/logP-DavyGuan-1.csv | 156 +++++++++++ .../Analysis/Submissions/logP-ECRISM-1.csv | 101 +++++++ .../Submissions/logP-EvrimArslan-6.csv | 60 +++++ .../Submissions/logP-FabioFalcioni-1.csv | 129 +++++++++ .../Analysis/Submissions/logP-IEFPCMMST-1.csv | 143 ++++++++++ .../logP-JudithWarnauDassaultSystemes.csv | 195 ++++++++++++++ .../Analysis/Submissions/logP-MLRUCR-1.csv | 144 ++++++++++ .../Submissions/logP-PieroProcacci-NES1-B.csv | 252 ++++++++++++++++++ .../Submissions/logP-PieroProcacci-NES1-G.csv | 200 ++++++++++++++ .../Submissions/logP-PieroProcacci-NES1-J.csv | 198 ++++++++++++++ .../logP/Analysis/Submissions/logP-dddc-1.csv | 173 ++++++++++++ .../logP/Analysis/Submissions/logP-dddc-2.csv | 172 ++++++++++++ .../Submissions/logP_AndrewPaluch_MD_1.csv | 167 ++++++++++++ .../Submissions/logP_AndrewPaluch_MD_2.csv | 169 ++++++++++++ .../logP_RodriguezPaluch_SM12_1.csv | 164 ++++++++++++ .../logP_RodriguezPaluch_SM8_1.csv | 164 ++++++++++++ .../logP_RodriguezPaluch_SM8_2.csv | 164 ++++++++++++ .../logP_RodriguezPaluch_SMD_1.csv | 163 +++++++++++ .../logP_RodriguezPaluch_SMD_2.csv | 163 +++++++++++ ...logP_prediction_Iorga_Beckstein_CGenFF.csv | 151 +++++++++++ .../logP_prediction_Iorga_Beckstein_GAFF.csv | 156 +++++++++++ ...P_prediction_Iorga_Beckstein_LigParGen.csv | 155 +++++++++++ ...ogP_prediction_Iorga_Beckstein_OPLS-AA.csv | 151 +++++++++++ .../Analysis/Submissions/logp-nhlbi-1.csv | 96 +++++++ 
.../Analysis/Submissions/logp-nhlbi-2.csv | 97 +++++++ .../logP/Analysis/Submissions/logp_DB1.csv | 214 +++++++++++++++ .../logP/Analysis/Submissions/logp_DB2.csv | 211 +++++++++++++++ .../logP/Analysis/Submissions/logp_DB3.csv | 214 +++++++++++++++ .../logP/Analysis/Submissions/logp_DB4.csv | 212 +++++++++++++++ .../Submissions/logp_ensemble_logp_model1.csv | 163 +++++++++++ .../Submissions/logp_ensemble_logp_model2.csv | 164 ++++++++++++ .../pKa/Analysis/SAMPL7-user-map-HG.csv | 10 + .../pKa/Analysis/Scripts/get_usermap.py | 16 ++ .../pKa/Analysis/Submissions/pKa-ECRISM-1.csv | 141 ++++++++++ .../Analysis/Submissions/pKa-IEFPCMMST-1.csv | 202 ++++++++++++++ .../Analysis/Submissions/pKa-RobertRaddi.csv | 194 ++++++++++++++ .../pKa/Analysis/Submissions/pKa-VA-2.csv | 86 ++++++ .../Submissions/pKa_RodriguezPaluch_SMD_1.csv | 152 +++++++++++ .../Submissions/pKa_RodriguezPaluch_SMD_2.csv | 146 ++++++++++ .../Submissions/pKa_RodriguezPaluch_SMD_3.csv | 143 ++++++++++ .../pKa_prediction_Iorga_Beckstein_1.csv | 198 ++++++++++++++ .../pKa/Analysis/Submissions/pka-nhlbi-1.csv | 141 ++++++++++ .../Submissions/pka-nhlbi-1_L0OUNi2.csv | 141 ++++++++++ .../Analysis/SAMPL7-user-map-HG.csv | 2 + .../Analysis/Scripts/get_usermap.py | 16 ++ .../Submissions/permeability-dddc-1.csv | 175 ++++++++++++ .../Submissions/permeability-dddc-2.csv | 175 ++++++++++++ 53 files changed, 7569 insertions(+) create mode 100644 physical_property/logP/Analysis/SAMPL7-user-map-HG.csv create mode 100644 physical_property/logP/Analysis/Scripts/get_usermap.py create mode 100644 physical_property/logP/Analysis/Submissions/LogP_chemprop_submission.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1_5l9kQR3.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-DavyGuan-1.csv create 
mode 100644 physical_property/logP/Analysis/Submissions/logP-ECRISM-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-EvrimArslan-6.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-FabioFalcioni-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-IEFPCMMST-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-JudithWarnauDassaultSystemes.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-MLRUCR-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-B.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-G.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-J.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-dddc-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP-dddc-2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM12_1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_CGenFF.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_GAFF.csv create mode 100644 physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_LigParGen.csv create mode 100644 
physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_OPLS-AA.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp-nhlbi-1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp-nhlbi-2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_DB1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_DB2.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_DB3.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_DB4.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model1.csv create mode 100644 physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model2.csv create mode 100644 physical_property/pKa/Analysis/SAMPL7-user-map-HG.csv create mode 100644 physical_property/pKa/Analysis/Scripts/get_usermap.py create mode 100644 physical_property/pKa/Analysis/Submissions/pKa-ECRISM-1.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa-IEFPCMMST-1.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa-RobertRaddi.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa-VA-2.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_1.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_2.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_3.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pKa_prediction_Iorga_Beckstein_1.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pka-nhlbi-1.csv create mode 100644 physical_property/pKa/Analysis/Submissions/pka-nhlbi-1_L0OUNi2.csv create mode 100644 physical_property/permeability/Analysis/SAMPL7-user-map-HG.csv create mode 100644 physical_property/permeability/Analysis/Scripts/get_usermap.py create mode 100644 physical_property/permeability/Analysis/Submissions/permeability-dddc-1.csv 
create mode 100644 physical_property/permeability/Analysis/Submissions/permeability-dddc-2.csv diff --git a/physical_property/logP/Analysis/SAMPL7-user-map-HG.csv b/physical_property/logP/Analysis/SAMPL7-user-map-HG.csv new file mode 100644 index 00000000..322b1794 --- /dev/null +++ b/physical_property/logP/Analysis/SAMPL7-user-map-HG.csv @@ -0,0 +1,35 @@ +21,logP-dddc-1.csv +23,logP-dddc-2.csv +25,logP-JudithWarnauDassaultSystemes.csv +26,logP-ChrisLoschen-1.csv +27,logP-ChrisLoschen-2.csv +28,logP-FabioFalcioni-1.csv +29,logp_ensemble_logp_model1.csv +30,logp_ensemble_logp_model2.csv +31,logP-EvrimArslan-6.csv +32,logP-ChrisLoschen-1_5l9kQR3.csv +33,logp_DB1.csv +34,logp_DB2.csv +35,logp_DB3.csv +36,logp_DB4.csv +37,logP-IEFPCMMST-1.csv +38,logP-ECRISM-1.csv +39,logP_AndrewPaluch_MD_1.csv +40,logP_AndrewPaluch_MD_2.csv +41,logP-PieroProcacci-NES1-B.csv +42,logP-PieroProcacci-NES1-G.csv +43,logP-PieroProcacci-NES1-J.csv +44,logP-DavyGuan-1.csv +45,LogP_chemprop_submission.csv +46,logP_RodriguezPaluch_SM12_1.csv +47,logP_RodriguezPaluch_SM8_1.csv +48,logP_RodriguezPaluch_SM8_2.csv +49,logP_RodriguezPaluch_SMD_1.csv +50,logP_RodriguezPaluch_SMD_2.csv +51,logP-MLRUCR-1.csv +52,logp-nhlbi-1.csv +53,logp-nhlbi-2.csv +54,logP_prediction_Iorga_Beckstein_LigParGen.csv +55,logP_prediction_Iorga_Beckstein_CGenFF.csv +56,logP_prediction_Iorga_Beckstein_GAFF.csv +57,logP_prediction_Iorga_Beckstein_OPLS-AA.csv diff --git a/physical_property/logP/Analysis/Scripts/get_usermap.py b/physical_property/logP/Analysis/Scripts/get_usermap.py new file mode 100644 index 00000000..2e0812b5 --- /dev/null +++ b/physical_property/logP/Analysis/Scripts/get_usermap.py @@ -0,0 +1,16 @@ +#!/bin/env python + +outfile = '../SAMPL7-user-map-HG.csv' + +# Read user map from submission server +file = open('/Users/dmobley/github/SAMPL-submission-systems/SAMPL-submission-handling-shared/submissions/downloads/submission_table.txt', 'r') +text = file.readlines() +file.close() + +# Write output file, 
removing e-mail addresses +file = open(outfile, 'w') +for line in text: + tmp = line.split(',') + if 'LOGP' in tmp[2].upper(): + file.write(f'{tmp[0].strip()},{tmp[2].strip().replace(" ","_")}\n') +file.close() diff --git a/physical_property/logP/Analysis/Submissions/LogP_chemprop_submission.csv b/physical_property/logP/Analysis/Submissions/LogP_chemprop_submission.csv new file mode 100644 index 00000000..991bcffc --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/LogP_chemprop_submission.csv @@ -0,0 +1,165 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM33,SM33_micro000,-4.77,0.09,0.53 +SM42,SM42_micro000,-4.31,0.05,0.53 +SM30,SM30_micro000,-4.02,0.10,0.53 +SM34,SM34_micro000,-3.59,0.15,0.53 +SM43,SM43_micro000,-3.42,0.10,0.53 +SM25,SM25_micro000,-3.18,0.04,0.53 +SM45,SM45_micro000,-3.01,0.07,0.53 +SM31,SM31_micro000,-2.94,0.09,0.53 +SM39,SM39_micro000,-2.89,0.08,0.53 +SM36,SM36_micro000,-2.79,0.10,0.53 +SM32,SM32_micro000,-2.61,0.12,0.53 +SM27,SM27_micro000,-2.53,0.11,0.53 +SM41,SM41_micro000,-2.41,0.07,0.53 +SM46,SM46_micro000,-2.08,0.10,0.53 +SM29,SM29_micro000,-1.94,0.08,0.53 +SM40,SM40_micro000,-1.87,0.09,0.53 +SM37,SM37_micro000,-1.85,0.11,0.53 +SM26,SM26_micro000,-1.51,0.06,0.53 +SM28,SM28_micro000,-1.40,0.06,0.53 +SM44,SM44_micro000,-1.07,0.08,0.53 +SM38,SM38_micro000,-0.96,0.08,0.53 +SM35,SM35_micro000,-0.93,0.09,0.53 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. 
+Participant name: +Bart Lenselink + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Galapagos + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +Chemprop + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.05, GPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +Linux workstation with an Intel(R) Xeon(R) W-2123 CPU & Quadro RTX 6000, training models took around a day, including a parameter search. (250 iterations) + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Chemprop (https://github.com/chemprop/chemprop , cloned on May 2020) +Pipeline pilot 17.2.0.1361 +ADMET Predictor 9.5 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. 
+# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +As a basis we used the logp dataset of the OPERA models (https://github.com/kmansouri/OPERA), accessed September 2020. +This dataset was processed and standardized in Pipeline pilot, we created a test set to test different models, tailored to the challenge (SAMPL_logp_1.xml); +All molecules with an ECFP_6 TC >0.25 compared with the challenge molecules, from the Opera set were flagged as test. (233 molecules) +The training set was created from the rest of the set, by subsequently filtering out all molecules with a ECFP_6 TC >0.4 to molecules found in the test set. +Several models were build using D-MPNN (https://github.com/chemprop/chemprop), focusing on: adding helper tasks (1), changing the parameters of the model(2) + +1: adding helper tasks: +We added different datasets that could be complementary in nature, as a separate task in the MT neural network: +LogP data from ChEMBL_26, LogD data from ChEMBL (AZ, doc id: CHEMBL3301361), in-house data. +Based on performance, both the ChEMBL_26, and AZ LogD data from ChEMBL were added. 
(all public data) +Finally we calculated logp, and LogD for all molecules using Simulations + ADMEpredictor, those predictions were added as additional tasks to the network (so 5 tasks in total) + +2: +Different parameters were explored using the native hyperopt script (250 iterations), and different ensemble sizes. +The final model was trained on all data, using an ensemble size of 10. +Predictions were done on basis of this ensemble, +TFE standard error of the mean (SEM) was estimated from the ensemble predictions. TFE model uncertainty was estimated from the RMSE on the test set (0.388*1.36333619568). +TFE was calculated from logP: logP *-1.36333619568 +#-RT*ln(10) +#-1.36333619568 = -1*(1.985877534*0.001)*298.15 *ln(10) + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True \ No newline at end of file diff --git a/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1.csv b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1.csv new file mode 100644 index 00000000..4614261f --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1.csv @@ -0,0 +1,118 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +Predictions: +SM25,SM25_micro000,-3.42,0.36,0.50 +SM26,SM26_micro000,-1.35,0.00,0.50 +SM27,SM27_micro000,-2.17,0.15,0.50 +SM28,SM28_micro000,-1.14,0.36,0.50 +SM29,SM29_micro000,-2.12,0.39,0.50 +SM30,SM30_micro000,-3.65,0.43,0.50 +SM31,SM31_micro000,-2.61,0.45,0.50 +SM32,SM32_micro000,-3.13,0.13,0.50 +SM33,SM33_micro000,-4.97,0.27,0.50 +SM34,SM34_micro000,-3.74,0.28,0.50 +SM35,SM35_micro000,-1.99,0.34,0.50 +SM36,SM36_micro000,-3.54,0.36,0.50 +SM37,SM37_micro000,-2.53,0.52,0.50 +SM38,SM38_micro000,-1.02,0.20,0.50 +SM39,SM39_micro000,-2.76,0.20,0.50 +SM40,SM40_micro000,-1.49,0.21,0.50 +SM41,SM41_micro000,-2.59,0.00,0.50 +SM42,SM42_micro000,-4.91,0.00,0.50 +SM43,SM43_micro000,-3.29,0.08,0.50 +SM44,SM44_micro000,-1.16,0.00,0.50 +SM45,SM45_micro000,-3.30,0.10,0.50 +SM46,SM46_micro000,-1.75,0.16,0.50 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Chris Loschen + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +not-organized/private + +# +# +# NAME SECTION +# +# The 'Name:' keyword is required as shown here. +Name: +ffsampled_deeplearning_cl1 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. 
+# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.01 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed on one GeForce GTX 1080 on a single linux machine. +Training of 100 epochs took about 1 hours. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Schnetpack 0.3 +Fastai 1.0.6 +RDKit 2020.03.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: +A modified version of the deeplearning package schnetpack was used which is based on the work of Schütt et al.[1] and may be seen as a variant of message passing neural networks. However, the input does not use the chemical graph but is only 3D structure based and does not rely on any kind of precomputed descriptors, rather the molecular representations are learned on-the-fly during training. The neural net was trained with the fastai library, version 1 [2] using accelerated learning, so-called super-convergence as published by L. Smith et al.[3] and other tools available from the fastai library, which allow for fast iterations during testing. +A curated logP dataset was assembled mainly from the work of Mansouri et al.[4] and used for training, testing and validation. Input structures for the neural net have been generated from the provided SMILES via the distance geometry approach as implemented in the RDKIT and a quick conformational sampling was carried out using the MMFF94 forcefield. +Before the 3D structure generation molecules have been brought into a canonical representation with the RDKit. Statistical uncertainties were estimated based on the average of 10 distinct predictions runs and on the overall test sets performance. + +[1] Schutt, K. T., Kessel, P., Gastegger, M., Nicoli, K. A., Tkatchenko, A., & Müller, K. R. (2018). SchNetPack: A deep learning toolbox for atomistic systems. Journal of chemical theory and computation, 15(1), 448-455. +[2] https://fastai1.fast.ai/ +[3] Smith, L. N., & Topin, N. (2019, May). Super-convergence: Very fast training of neural networks using large learning rates. In Artificial Intelligence and Machine Learning for Multi-Domain Operations Applications (Vol. 11006, p. 1100612). International Society for Optics and Photonics. +[4] Mansouri, K., Grulke, C. M., Judson, R. S., & Williams, A. J. (2018). OPERA models for predicting physicochemical properties and environmental fate endpoints. 
Journal of cheminformatics, 10(1), 10.) + +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1_5l9kQR3.csv b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1_5l9kQR3.csv new file mode 100644 index 00000000..4614261f --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-1_5l9kQR3.csv @@ -0,0 +1,118 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +Predictions: +SM25,SM25_micro000,-3.42,0.36,0.50 +SM26,SM26_micro000,-1.35,0.00,0.50 +SM27,SM27_micro000,-2.17,0.15,0.50 +SM28,SM28_micro000,-1.14,0.36,0.50 +SM29,SM29_micro000,-2.12,0.39,0.50 +SM30,SM30_micro000,-3.65,0.43,0.50 +SM31,SM31_micro000,-2.61,0.45,0.50 +SM32,SM32_micro000,-3.13,0.13,0.50 +SM33,SM33_micro000,-4.97,0.27,0.50 +SM34,SM34_micro000,-3.74,0.28,0.50 +SM35,SM35_micro000,-1.99,0.34,0.50 +SM36,SM36_micro000,-3.54,0.36,0.50 +SM37,SM37_micro000,-2.53,0.52,0.50 +SM38,SM38_micro000,-1.02,0.20,0.50 +SM39,SM39_micro000,-2.76,0.20,0.50 +SM40,SM40_micro000,-1.49,0.21,0.50 +SM41,SM41_micro000,-2.59,0.00,0.50 +SM42,SM42_micro000,-4.91,0.00,0.50 +SM43,SM43_micro000,-3.29,0.08,0.50 +SM44,SM44_micro000,-1.16,0.00,0.50 +SM45,SM45_micro000,-3.30,0.10,0.50 +SM46,SM46_micro000,-1.75,0.16,0.50 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Chris Loschen + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. 
+Participant organization: +not-organized/private + +# +# +# NAME SECTION +# +# The 'Name:' keyword is required as shown here. +Name: +ffsampled_deeplearning_cl1 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.01 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed on one GeForce GTX 1080 on a single linux machine. +Training of 100 epochs took about 1 hours. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Schnetpack 0.3 +Fastai 1.0.6 +RDKit 2020.03.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. 
+# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +A modified version of the deeplearning package schnetpack was used which is based on the work of Schütt et al.[1] and may be seen as a variant of message passing neural networks. However, the input does not use the chemical graph but is only 3D structure based and does not rely on any kind of precomputed descriptors, rather the molecular representations are learned on-the-fly during training. The neural net was trained with the fastai library, version 1 [2] using accelerated learning, so-called super-convergence as published by L. Smith et al.[3] and other tools available from the fastai library, which allow for fast iterations during testing. +A curated logP dataset was assembled mainly from the work of Mansouri et al.[4] and used for training, testing and validation. Input structures for the neural net have been generated from the provided SMILES via the distance geometry approach as implemented in the RDKIT and a quick conformational sampling was carried out using the MMFF94 forcefield. +Before the 3D structure generation molecules have been brought into a canonical representation with the RDKit. Statistical uncertainties were estimated based on the average of 10 distinct predictions runs and on the overall test sets performance. + +[1] Schutt, K. T., Kessel, P., Gastegger, M., Nicoli, K. A., Tkatchenko, A., & Müller, K. R. (2018). SchNetPack: A deep learning toolbox for atomistic systems. 
Journal of chemical theory and computation, 15(1), 448-455. +[2] https://fastai1.fast.ai/ +[3] Smith, L. N., & Topin, N. (2019, May). Super-convergence: Very fast training of neural networks using large learning rates. In Artificial Intelligence and Machine Learning for Multi-Domain Operations Applications (Vol. 11006, p. 1100612). International Society for Optics and Photonics. +[4] Mansouri, K., Grulke, C. M., Judson, R. S., & Williams, A. J. (2018). OPERA models for predicting physicochemical properties and environmental fate endpoints. Journal of cheminformatics, 10(1), 10.) + +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-2.csv b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-2.csv new file mode 100644 index 00000000..96b26f8f --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-ChrisLoschen-2.csv @@ -0,0 +1,118 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +Predictions: +SM25,SM25_micro000,-4.29,0.27,0.49 +SM26,SM26_micro000,-1.22,0.00,0.49 +SM27,SM27_micro000,-2.14,0.06,0.49 +SM28,SM28_micro000,-1.23,0.34,0.49 +SM29,SM29_micro000,-2.37,0.07,0.49 +SM30,SM30_micro000,-4.23,0.16,0.49 +SM31,SM31_micro000,-2.95,0.24,0.49 +SM32,SM32_micro000,-3.23,0.05,0.49 +SM33,SM33_micro000,-5.19,0.18,0.49 +SM34,SM34_micro000,-3.87,0.21,0.49 +SM35,SM35_micro000,-1.81,0.32,0.49 +SM36,SM36_micro000,-3.72,0.24,0.49 +SM37,SM37_micro000,-2.36,0.34,0.49 +SM38,SM38_micro000,-1.36,0.16,0.49 +SM39,SM39_micro000,-3.20,0.08,0.49 +SM40,SM40_micro000,-1.57,0.35,0.49 +SM41,SM41_micro000,-2.75,0.00,0.49 
+SM42,SM42_micro000,-5.03,0.00,0.49 +SM43,SM43_micro000,-3.39,0.20,0.49 +SM44,SM44_micro000,-1.24,0.00,0.49 +SM45,SM45_micro000,-3.27,0.06,0.49 +SM46,SM46_micro000,-1.71,0.07,0.49 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Chris Loschen + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +not-organized/private + +# +# +# NAME SECTION +# +# The 'Name:' keyword is required as shown here. +Name: +ffsampled_deeplearning_cl2 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.01 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed on one GeForce GTX 1080 on a single linux machine. +Training of 100 epochs took about 1 hours. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Schnetpack 0.3 +Fastai 1.0.6 +RDKit 2020.03.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. 
+# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +A modified version of the deeplearning package schnetpack was used which is based on the work of Schütt et al.[1] and may be seen as a variant of message passing neural networks. However, the input is only 3D structure based and does not rely on any kind of precomputed descriptors, rather the molecular representations are learned on-the-fly during training. The neural net was trained with the fastai library, version 1 [2] using accelerated learning, so-called super-convergence as published by L. Smith et al.[3] and other tools available from the fastai library, which allow for fast iterations during testing. +A curated logP dataset was assembled mainly from the work of Mansouri et al.[4] and used for training, testing and validation. Input structures for the neural net have been generated from the provided SMILES via the distance geometry approach as implemented in the RDKIT and a quick conformational sampling was carried out using the MMFF94 forcefield. +Before the 3D structure generation molecules have been brought into a canonical representation with the RDKit. 
Statistical uncertainties were estimated based on the average of 10 distinct predictions runs and on the overall test sets performance. + +[1] Schutt, K. T., Kessel, P., Gastegger, M., Nicoli, K. A., Tkatchenko, A., & Müller, K. R. (2018). SchNetPack: A deep learning toolbox for atomistic systems. Journal of chemical theory and computation, 15(1), 448-455. +[2] https://fastai1.fast.ai/ +[3] Smith, L. N., & Topin, N. (2019, May). Super-convergence: Very fast training of neural networks using large learning rates. In Artificial Intelligence and Machine Learning for Multi-Domain Operations Applications (Vol. 11006, p. 1100612). International Society for Optics and Photonics. +[4] Mansouri, K., Grulke, C. M., Judson, R. S., & Williams, A. J. (2018). OPERA models for predicting physicochemical properties and environmental fate endpoints. Journal of cheminformatics, 10(1), 10.) + +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP-DavyGuan-1.csv b/physical_property/logP/Analysis/Submissions/logP-DavyGuan-1.csv new file mode 100644 index 00000000..61b7c550 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-DavyGuan-1.csv @@ -0,0 +1,156 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. 
+# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. 
+# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25_micro000,3.98,0.11,0.42 +SM26,SM26_micro000,1.53,0.02,0.42 +SM27,SM27_micro000,1.42,0.57,0.42 +SM28,SM28_micro000,0.99,0.44,0.42 +SM29,SM29_micro000,1.14,0.46,0.42 +SM30,SM30_micro000,3.59,0.15,0.42 +SM31,SM31_micro000,1.95,0.40,0.42 +SM32,SM32_micro000,2.15,0.51,0.42 +SM33,SM33_micro000,4.42,0.15,0.42 +SM34,SM34_micro000,2.87,0.08,0.42 +SM35,SM35_micro000,0.77,0.35,0.42 +SM36,SM36_micro000,2.63,0.25,0.42 +SM37,SM37_micro000,1.32,0.25,0.42 +SM38,SM38_micro000,0.62,0.43,0.42 +SM39,SM39_micro000,2.47,0.54,0.42 +SM40,SM40_micro000,1.04,0.52,0.42 +SM41,SM41_micro000,2.45,0.01,0.42 +SM42,SM42_micro000,4.28,0.23,0.42 +SM43,SM43_micro000,2.40,0.12,0.42 +SM44,SM44_micro000,1.65,0.34,0.42 +SM45,SM45_micro000,3.75,0.33,0.42 +SM46,SM46_micro000,2.31,0.18,0.42 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Davy Guan + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +The University of Sydney + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +RayLogP三_QSPR_Mordred2D_TPOT-AutoML + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. 
+Compute time: +53.1 CPU hours + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All machine learning model training was conducted on a single machine hosting an Intel Core i7 5820K overclocked to 4.2Ghz. +Automated model searching, including hyperparameter tuning, model training, and cross-validation, for an initial population of 100 models over 10 generations was conducted in parallel on the six CPU cores. +The wall clock time taken to complete model searching was 11.7 hours, of which approximately 75% was spent doing parallel work and 25% conducting the initialisation of each generation in serial. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +TPOT Python library 0.10.2 +Mordred Python library 1.2.0 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Datasets: Structures from the Mansouri [1] and Martel [2] LogP datasets were combined to form an initial 14,707 chemical training dataset. Curation consisted of salt and solvent removal, adding explicit hydrogens, retaining only the heaviest fragments of mixtures, and neutralisation of any charged structures in ChemAxon Standardizer. An additional 15 compound dataset consisting of structures with sulfone functional groups and experimental LogP values was selected from the DrugBank database. The Mordred library [3] generated 2D descriptors for these datasets, along with the 22 compound SAMPL7 LogP Prediction Challenge molecules. +Modelling: The TPOT library [4] automated model searching and hyperparameter tuning using a genetic algorithm configured to optimise a population of 100 models for the lowest mean absolute error in LogP prediction over 10 generations. This model was validated on the held out 15 sulfone compound dataset with 0.42 RMSE. +TFE conversion from LogP to TFE: Standard state conditions were assumed in the conversion of predicted LogP values from the TPOT model to transfer free energies. This consisted of multiplying the predicted logP by the temperature (T=298K), gas constant (R=1.986x10^-3 kcal/K.mol) and 2.303 [5]. +Statistics: Statistical uncertainty was estimated by taking the standard deviation of the transfer free energies predicted by this model, and an earlier initial model trained with a population of 10 models for 10 generations used for prototyping. The standard error of the mean was calculated by dividing this standard deviation by the square root of 2. 
+ +[1] Mansouri et al. (2018) J Cheminform 10: 10. +[2] Martel et al. (2013) Eur J Pharm Sci 48: 21-29. +[3] Moriwaki et al. (2018) J Cheminform 10: 4. +[4] Le et al. (2020) Bioinformatics 36: 250-256. +[5] Guan et al. (2020) J Comput Aided Mol Des 34: 511–522. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True \ No newline at end of file diff --git a/physical_property/logP/Analysis/Submissions/logP-ECRISM-1.csv b/physical_property/logP/Analysis/Submissions/logP-ECRISM-1.csv new file mode 100644 index 00000000..ddea258e --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-ECRISM-1.csv @@ -0,0 +1,101 @@ +Predictions: +SM25,SM25_micro000 SM25_micro002 SM25_micro004,-5.77,0.01,1.05,-0.34,0.01 +SM26,SM26_micro000 SM26_micro002 SM26_micro004,-3.26,0.01,1.05,-1.93,0.01 +SM27,SM27_micro000,-3.01,0.01,1.05,2.21,0.01 +SM28,SM28_micro000 SM28_micro001,-2.98,0.01,1.05,2.18,0.01 +SM29,SM29_micro000,-2.82,0.01,1.05,2.06,0.01 +SM30,SM30_micro000,-5.16,0.01,1.05,3.77,0.01 +SM31,SM31_micro000,-4.46,0.01,1.05,3.27,0.01 +SM32,SM32_micro000,-3.53,0.01,1.05,2.59,0.01 +SM33,SM33_micro000,-7.19,0.01,1.05,5.26,0.01 +SM34,SM34_micro000,-7.19,0.01,1.05,5.27,0.01 +SM35,SM35_micro000 SM35_micro002,-1.29,0.01,1.05,0.94,0.01 +SM36,SM36_micro000 SM36_micro002,-3.53,0.01,1.05,2.58,0.01 +SM37,SM37_micro000 SM37_micro003,-2.92,0.01,1.05,2.14,0.01 +SM38,SM38_micro000,-3.14,0.01,1.05,2.29,0.01 +SM39,SM39_micro000,-5.68,0.01,1.05,4.08,0.01 +SM40,SM40_micro000,-4.92,0.01,1.05,3.6,0.01 +SM41,SM41_micro000,-4.52,0.01,1.05,-0.54,0.01 +SM42,SM42_micro000 SM42_micro003,-8.54,0.01,1.05,2.09,0.01 +SM43,SM43_micro000 SM43_micro004,-5.83,0.01,1.05,2.13,0.01 
+SM44,SM44_micro000,-2.21,0.01,1.05,-0.93,0.01 +SM45,SM45_micro000,-4.33,0.01,1.05,0.01,0.01 +SM46,SM46_micro000,-3.49,0.01,1.05,0.41,0.01 + +Participant name: +Stefan M. Kast, Nicolas Tielker + +Participant organization: +TU Dortmund University + +Name: +EC_RISM_wet + +Compute time: +174 hours, CPU + +Computing and hardware: +All calculations were conducted on the LiDO 3 high performance cluster of TU Dortmund University. Calculations were automatically scheduled and ran on either an Intel Xeon E5-4604v4 or an Intel Xeon E5-2640v4 CPU, depending on node availability. + +Software: +Corina 4.3.0 +Gaussian 09 Rev E.01 +Gaussian 16 Rev C.01 +3D RISM (inhouse development) +EC-RISM (inhouse development) +Python 3.6 +Anaconda2018.12 +Amber 12 +Mathematica 12.0 (Wolfram) + +Category: +Physical (QM) + +Method: +For microstates with multiple possible stereoisomers these were generated using Corina. +50 geometries, or 200 for molecules containing more than seven rotatable bonds, were generated for each microstate using the EmbedMultipleConfs function of RDKit. These structures were pre-optimized with Amber 12 using GAFF 1.7 parameters and AM1-BCC charges with an ALPB model to represent the dielectric environment of water and n-octanol, respectively. +Conformations with an energy of more than 20 kcal/mol than the minimum structure of that microstate were discarded and the remaining structures clustered with a structural RMSD of 0.5 Angstrom. The cluster representatives were then optimized using Gaussian 16revC01 with IEF-PCM using default settings for water or n-octanol respectively at the B3LYP/6-311+G(d,p) level of theory. 
+Additional stereoisomers were treated as if they were additional conformational states so that for each microstate only up to 5 conformations with the lowest PCM energies for each solvent were treated with EC-RISM/MP2/6-311+G(d,p) using the PSE2 closure for water and the PSE1 closure for n-octanol [REF1] and the resulting EC-RISM energies corrected using (c1*mu_{ex}+c2*PMV_{EC-RISM}) since only neutral species were considered for the calculation of the transfer free energy. The correction for water has a fixed parameter c1 = 1 since this additional parameter was found to be of no predictive value in previous challenges and c2 = -0.1025 kcal*mol^-1*A^-3 while for octanol c1 = 1.2915 and c2 = -0.0141 kcal*mol^-1*A^-3 [REF2]. +The transfer free energy of a compound was then calculated by dG_trans=G_{oct}-G_{wat}, where G_{m} refers to the partition function estimate of the solvent specific free energy by summing over the conformational and tautomer states [REF3]. The log P can then be calculated by log P = dG_trans/(-RT*ln(10)). The log D was calculated at a pH of 7.4. +The SEM was estimated as the convergence criterion for a single EC-RISM calculation. The uncertainty was estimated as the RMSE of the n-octanol-water transfer free energy dataset contained within the Minnesota Solvation Database. + +References: +REF1: N. Tielker, D. Tomazic, J. Heil, T. Kloss, S. Ehrhart, S. Guessregen, K. F. Schmidt, S. M. Kast, J. Comput.-Aided Mol. Des. 30, 1035-1044 (2016). +REF2: N. Tielker, L. Eberlein, S. Guessregen, S. M. Kast, J. Comput.-Aided Mol. Des. 32, 1151-1163 (2018). +REF3: N. Tielker, D. Tomazic, L. Eberlein, S. Guessregen, S. M. Kast, J. Comput.-Aided Mol. Des. 34, 453-461 (2020).
+ +SM25_micro000, challenge provided SM25_micro000 SMILES string, 0% population +SM25_micro002, challenge provided SM25_micro002 SMILES string, 0% population +SM25_micro004, challenge provided SM25_micro004 SMILES string, 100% population +SM26_micro000, challenge provided SM26_micro000 SMILES string, 100% population +SM26_micro002, challenge provided SM26_micro002 SMILES string, 0% population +SM26_micro004, challenge provided SM26_micro004 SMILES string, 0% population +SM27_micro000, challenge provided SM27_micro000 SMILES string, 100% population +SM28_micro000, challenge provided SM28_micro000 SMILES string, 100% population +SM28_micro001, challenge provided SM28_micro001 SMILES string, 0% population +SM29_micro000, challenge provided SM29_micro000 SMILES string, 100% population +SM30_micro000, challenge provided SM30_micro000 SMILES string, 100% population +SM31_micro000, challenge provided SM31_micro000 SMILES string, 100% population +SM32_micro000, challenge provided SM32_micro000 SMILES string, 100% population +SM33_micro000, challenge provided SM33_micro000 SMILES string, 100% population +SM34_micro000, challenge provided SM34_micro000 SMILES string, 100% population +SM35_micro000, challenge provided SM35_micro000 SMILES string, 62% population +SM35_micro002, challenge provided SM35_micro002 SMILES string, 38% population +SM36_micro000, challenge provided SM36_micro000 SMILES string, 47% population +SM36_micro002, challenge provided SM36_micro002 SMILES string, 53% population +SM37_micro000, challenge provided SM37_micro000 SMILES string, 48% population +SM37_micro003, challenge provided SM37_micro003 SMILES string, 52% population +SM38_micro000, challenge provided SM38_micro000 SMILES string, 100% population +SM39_micro000, challenge provided SM39_micro000 SMILES string, 100% population +SM40_micro000, challenge provided SM40_micro000 SMILES string, 100% population +SM41_micro000, challenge provided SM41_micro000 SMILES string, 100% population 
+SM42_micro000, challenge provided SM42_micro000 SMILES string, 0% population +SM42_micro003, challenge provided SM42_micro003 SMILES string, 100% population +SM43_micro000, challenge provided SM43_micro000 SMILES string, 0% population +SM43_micro004, challenge provided SM43_micro004 SMILES string, 100% population +SM44_micro000, challenge provided SM44_micro000 SMILES string, 100% population +SM45_micro000, challenge provided SM45_micro000 SMILES string, 100% population +SM46_micro000, challenge provided SM46_micro000 SMILES string, 100% population + +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-EvrimArslan-6.csv b/physical_property/logP/Analysis/Submissions/logP-EvrimArslan-6.csv new file mode 100644 index 00000000..e690a80b --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-EvrimArslan-6.csv @@ -0,0 +1,60 @@ +#OCTANOL TO WATER (DG_octanol - DG_water) TRANSFER FREE ENERGY PREDICTIONS +Predictions: +SM25,SM25_micro000,-0.59,0.36,0.8 +SM26,SM26_micro000,0.75,0.36,0.8 +SM27,SM27_micro000,-0.67,0.36,0.8 +SM28,SM28_micro000,0.82,0.36,0.8 +SM29,SM29_micro000,0.7,0.36,0.8 +SM30,SM30_micro000,-1.29,0.36,0.8 +SM31,SM31_micro000,-0.79,0.36,0.8 +SM32,SM32_micro000,-1.69,0.36,0.8 +SM33,SM33_micro000,-3.48,0.36,0.8 +SM34,SM34_micro000,-1.99,0.36,0.8 +SM35,SM35_micro000,1.11,0.36,0.8 +SM36,SM36_micro000,-0.26,0.36,0.8 +SM37,SM37_micro000,2.21,0.36,0.8 +SM38,SM38_micro000,3.54,0.36,0.8 +SM39,SM39_micro000,0.59,0.36,0.8 +SM40,SM40_micro000,2.61,0.36,0.8 +SM41,SM41_micro000,1.41,0.36,0.8 +SM42,SM42_micro000,-0.33,0.36,0.8 +SM43,SM43_micro000,0.27,0.36,0.8 +SM44,SM44_micro000,2.23,0.36,0.8 +SM45,SM45_micro000,0.68,0.36,0.8 +SM46,SM46_micro000,2.46,0.36,0.8 + +Participant name: +Evrim Arslan + +Participant organization: +Bogazici University + +Name: +TFE-prediction-solvation-b3lypd3 + +Compute time: +28 hours CPU + +Computing and hardware: +All the simulations were performed with GPU01 and curie clusters +GPU01 intel Xeon E5-2697A 
/32 Core-NVIDIA GK210GL [Tesla K80] x 4 G4GB RAM +CURIE intel Xeon E7-4870 v2s/120 core-512GB RAM + +Software: +Spartan14 V1.1.0 +GaussView 6.0.16 +Gaussian16-RevA.03 + +Category: +Physical (QM) + +Method: +All possible conformations of the ligands are located with the semi-empirical PM3 method by using the SPARTAN software. Free rotations around single bonds are taken into account and all the geometries corresponding to stationary points are re-optimised with the Gaussian software package by density functional theory (DFT) using the B3LYP-D3 functional and the 6-311+G(d,p) basis set. Solvation Model based on Density (SMD) method at 298.15K is used for the optimizations of the conformations in water (e=78.36) and octanol (e=10.30). Equation 1 is used to calculate the log(KOW) value of the molecules corresponding to the global minima +log(KOW)=[DG_solv(water)-DG_solv(octanol)]/2.303RT (Equation 1) +#For the calculation of standard error of mean (SEM) standard deviations (stdev) are calculated according to the Equation 2 where value is the TFE (transfer free energy) of each molecule, mean is the mean of TFEs of the molecules of interest and N is the size of the population +stdev = sqrt((1/(N-1))*(sum of (value-mean)^2)) (Equation 2) +After calculation of standard deviations standard error of mean (SEM) values are calculated according to the Equation 3 +#SEM=stdev/sqrt(N) (Equation 3) + +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-FabioFalcioni-1.csv b/physical_property/logP/Analysis/Submissions/logP-FabioFalcioni-1.csv new file mode 100644 index 00000000..db7c4333 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-FabioFalcioni-1.csv @@ -0,0 +1,129 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description.
+# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty + +Predictions: +# Example of a submission that used only challenge provided neutral microstate (`SMXX_micro000` from the provided input files) +SM25,SM25_micro000,-3.74,0.91,1.3 +SM26,SM26_micro000,-0.12,0.30,1.3 +SM27,SM27_micro000,-1.81,1.11,1.3 +SM28,SM28_micro000,1.29,0.77,1.3 +SM29,SM29_micro000,-1.62,2.63,1.3 +SM30,SM30_micro000,-3.02,1.43,1.3 +SM31,SM31_micro000,-3.06,0.35,1.3 +SM32,SM32_micro000,0.10,1.26,1.3 +SM33,SM33_micro000,-6.24,0.41,1.3 +SM34,SM34_micro000,-4.15,2.23,1.3 +SM35,SM35_micro000,-0.55,1.66,1.3 +SM36,SM36_micro000,-3.77,1.71,1.3 +SM37,SM37_micro000,-2.73,2.20,1.3 +SM38,SM38_micro000,-4.95,1.97,1.3 +SM39,SM39_micro000,-5.36,2.69,1.3 +SM40,SM40_micro000,-2.15,1.29,1.3 +SM41,SM41_micro000,-4.60,2.16,1.3 +SM42,SM42_micro000,-8.46,1.42,1.3 +SM43,SM43_micro000,-4.88,1.44,1.3 +SM44,SM44_micro000,-7.11,0.56,1.3 +SM45,SM45_micro000,-7.70,0.52,1.3 +SM46,SM46_micro000,-4.31,1.25,1.3 + +# +# +Participant name: +Fabio Falcioni +Jas Kalayan +Richard Henchman + +# +# +Participant organization: +University of Manchester + +# +# NAME SECTION +Name: +MD-EE-MCC (GAFF-TIP4P-Ew) + +# +# +# COMPUTE TIME SECTION +# +# The 'Compute time:' keyword is required as shown here. +Compute time: +7 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. 
+Computing and hardware: + +All the simulations were performed on a single Nvidia V100-SXM2_16gb GPU hosting a node of 2x16 Intel Xeon Gold 6130 CPUs at the "Computational Shared Facility" at the University of Manchester. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Amber2018 +AmberTools18 +Poseidon beta + +# METHOD CATEGORY SECTION + +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# + +Method: + +Simulation setup was done using the AMBER 18 MD simulations package. The molecules pdb files were obtained through SMILES strings available in the challenge GitHub repository and the molecular building software Avogadro. The TIP4P-EW force field for water and GAFF force field with AM1-BCC charges for octanol and solutes were used, with parameters generated using AmberTools 18. Topology files were obtained through antechamber and the built-in tool LEaP. Boxes of side 34 Angstroms were created using Packmol for both pure solvents and solutions. Octanol boxes contain 150 molecules and 1 solute molecule, while water boxes contain 1300 water molecules and 1 solute molecule. + +For all simulations a 5000 steepest descent minimisation of the system followed by a 200 ps heating phase of molecular dynamics (MD) simulation to 298 K was employed. Both equilibrium and production MD simulations were run in the NPT ensemble with target pressure 1 bar using the Berendsen barostat and relaxation time constant tauP = 2.0ps and a simulation time step of 2 fs using the SHAKE algorithm for bond length constraints involving hydrogens.
Temperature was controlled using Langevin dynamics with a collision frequency gamma= 5.0 ps-1. Non-bonded interactions were calculated with a cut-off distance of 10.0 Angstroms and Particle-Mesh Ewald for long-range electrostatic interactions. Systems were equilibrated for 25 ns and data was collected for 100 ns. + +Free energies are evaluated using the energy-entropy (EE) method whereby G = H - TS. Energy is calculated directly from the simulation and entropy is calculated using Multiscale Cell Correlation (MCC). MCC uses the forces and coordinates generated in the MD simulation.[1,2,3] Entropy S of a system is calculated as: + S_liquid = N*S_solvent + S_solution = N*S_solvent + S_solute +where S_solvent is the entropy of a single molecule and N is the number of molecules. +For partition coefficients, simulations are run of both pure water and octanol and of solutions of each solute in water and octanol. +The transfer free energy of the solute into solvent i equals + ∆G_i = G_solution − G_liquid +The final free energy difference for transfer from water to octanol is + ∆G_transfer= ∆G_octanol - ∆G_water + +Each simulation was done in triplicate to assess the statistical uncertainty of the model, yielding a Standard Error of the Mean (SEM) calculated as + SEM=s/(sqrt(n)) +where s is the standard deviation and n the number of repetitions. Model uncertainty is 1.3 kcal/mol based on the root-mean squared error of the energy due to GAFF as found in literature.[4] + +Citations +[1]. R. H. Henchman, J. Chem. Phys., 2007, 126, 064504. +[2]. J. Higham, S. Chou, F. Grater, R. H. Henchman, Mol. Phys., 2018, 116, 1965. +[3]. H. S. Ali, J. Higham, R. H. Henchman, Entropy, 2020, 21, 750. +[4]. J. Wang, R. Wolf, J. Caldwell, P. Kollman and D. Case, J. of Comput. Chem., 2004, 25, 1157-1174. +# +# +# All submissions must either be ranked or non-ranked.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-IEFPCMMST-1.csv b/physical_property/logP/Analysis/Submissions/logP-IEFPCMMST-1.csv new file mode 100644 index 00000000..da54e7f3 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-IEFPCMMST-1.csv @@ -0,0 +1,143 @@ + +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-2.57,0.00,1.06 +SM26,SM26_micro000,0.28,0.00,1.06 +SM27,SM27_micro000,-2.39,0.00,1.06 +SM28,SM28_micro000,-1.13,0.00,1.06 +SM29,SM29_micro000,-1.68,0.00,1.06 +SM30,SM30_micro000,-4.81,0.00,1.06 +SM31,SM31_micro000,-2.20,0.00,1.06 +SM32,SM32_micro000,-2.23,0.00,1.06 +SM33,SM33_micro000,-5.83,0.00,1.06 +SM34,SM34_micro000,-3.26,0.00,1.06 +SM35,SM35_micro000,-1.05,0.00,1.06 +SM36,SM36_micro000,-5.10,0.00,1.06 +SM37,SM37_micro000,-2.56,0.00,1.06 +SM38,SM38_micro000,-0.65,0.00,1.06 +SM39,SM39_micro000,-3.37,0.00,1.06 +SM40,SM40_micro000,-1.94,0.00,1.06 +SM41,SM41_micro000,-1.19,0.00,1.06 +SM42,SM42_micro000,-5.10,0.00,1.06 +SM43,SM43_micro000,-2.52,0.00,1.06 +SM44,SM44_micro000,0.22,0.00,1.06 +SM45,SM45_micro000,-2.77,0.00,1.06 +SM46,SM46_micro000,-1.29,0.00,1.06 + + +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +CBDD Group + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Barcelona and University of Costa Rica. + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-prediction-method-IEFPCM/MST + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +28 hours, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The conformational ensembles were built in the RPBS Web Portal (https://mobyle.rpbs.univ-paris-diderot.fr/cgi-bin/portal.py#forms::Frog2) using the FRee Online druG conformation generation (Frog2). Quamtum mechanics computations were run on the Consorci de Serveis Universitaris de Catalunya (CSUC). + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Software Frog 2.14 +Software Gaussian 16 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: +The IEFPCM/MST method is a quantum mechanical (QM) self-consistent continuum solvation method that has proved to be a cost-effective approach for the calculation of solvation free energies in different solvents. From the computation of the solvation free energies in water and n-octanol the partition coefficient is determined. In the Miertus−Scrocco−Tomasi model, the solvation free energy is calculated by adding nonelectrostatic and electrostatic contributions, which are calculated using a double molecule-shaped cavity for the solute embedded in the polarizable continuum medium. The IEFPCM/MST method used to estimate log P can be subdivided into two parts. In the first step, we used the Frog 2.14 software, which helped us determine the conformational preferences of the set of 22 molecules. In this case, due to their structure and complexity, we limited the number of generated conformations to 15. After reviewing the sampling of the conformational space and eliminating the redundant conformations, the second part of the method is based on QM calculations, using the Gaussian 16 package. Then, the conformations obtained with Frog 2.14 were optimized using the MST method, both in water and in octanol, at the B3LYP/6-31G(d) level of theory, checking the frequencies of all of them. After this step, those conformations with negative frequencies were ignored for the subsequent steps. Then, single point calculations were performed in gas, water and octanol, all of them at the B3LYP/6-31G(d) level of theory. All single point calculations were used to estimate the Solvation Free Energy for each of the conformations. Moreover, Thermal Correction to Gibbs Free Energy was also added to all of them. The total energy of each conformer was weighted using the Boltzmann approximation. Finally, an average energy for the octanol and water structure was obtained, which allowed us to obtain the difference dG Octanol-dG Water and consequently its log P.
+ +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-JudithWarnauDassaultSystemes.csv b/physical_property/logP/Analysis/Submissions/logP-JudithWarnauDassaultSystemes.csv new file mode 100644 index 00000000..7528e015 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-JudithWarnauDassaultSystemes.csv @@ -0,0 +1,195 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# +# The energy units must be in kcal/mol. +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`.,,, +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+# +# +Predictions: +SM25,SM25_extra001,-5.30,0.1,0.6 +SM26,SM26_micro000,-2.78,0.1,0.6 +SM27, SM27_micro000,-3.45,0.1,0.6 +SM28, SM28_micro000,-2.00,0.1,0.6 +SM29, SM29_micro000,-2.93,0.1,0.6 +SM30, SM30_micro000,-4.72,0.1,0.6 +SM31, SM31_micro000,-4.48,0.1,0.6 +SM32, SM32_micro000,-4.40,0.1,0.6 +SM33, SM33_micro000,-6.91,0.1,0.6 +SM34, SM34_micro000,-6.11,0.1,0.6 +SM35, SM35_micro002 SM35_micro000,-1.99,0.1,0.6 +SM36, SM36_micro002 SM36_micro000,-4.26,0.1,0.6 +SM37, SM37_micro000 SM37_micro003,-3.46,0.1,0.6 +SM38, SM38_micro000,-2.07,0.1,0.6 +SM39, SM39_micro000,-3.65,0.1,0.6 +SM40, SM40_micro000,-3.69,0.1,0.6 +SM41, SM41_micro000 SM41_extra001,-3.13,0.1,0.6 +SM42, SM42_extra001 SM42_micro000,-5.87,0.1,0.6 +SM43, SM43_extra001 SM43_micro000,-4.66,0.1,0.6 +SM44, SM44_micro000 SM44_extra001,-1.75,0.1,0.6 +SM45, SM45_micro000 SM45_extra001,-4.07,0.1,0.6 +SM46, SM46_micro000 SM46_extra001,-3.12,0.1,0.6 + +# +# +# Please list your name," using only UTF-8 characters as described above. The ""Participant name:"" entry is required." +Participant name: +Michael Diedenhofen, Frank Eckert, Jens Reinisch, Johannes Schwoebel, Judith Warnau, Karin Wichmann + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Dassault Systemes Deutschland GmbH, BIOVIA + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used.,,,, +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +COSMO-RS + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. 
+Compute time: +216 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The average compute time across all molecule was estimated by taking the conformationally most demanding molecule SM36 and conducting a conformer search on standard Think Pad Laptop (Intel Core i7-985H CPU 2.6 GHz, 32GB RAM). +The estimated time on this hardware is 9 CPU-days per molecule including all considered microstates. The average time per microstate is 3 CPU-days. +In total, we performed more than 8500 BP-TZVP-COSMO geometry optimizations on various standard CPUs. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +BIOVIA COSMOquick 2020: A proprietary tool of Dassault Systemes to generate tautomeric states. +BIOVIA COSMOconf 2020: A proprietary tool of Dassault Systemes that applies conformational search using force fields, TURBOMOLE and COSMOtherm. +BIOVIA COSMOtherm 2020: The proprietary software developed and distributed by Dassault Systemes, which uses COSMO-RS. COSMO-RS is a published theory: Klamt A (1995) Conductor-like screening model for real solvents: a new approach to the quantitative calculation of solvation phenomena. J Phys Chem 99:2224-2235. https://doi.org/10.1021/j100007a062 +TURBOMOLE 7.5: The quantum chemistry suite. University of Karlsruhe and Forschungszentrum Karlsruhe GmbH, 1989-2007, TURBOMOLE GmbH, since 2007; available from http://www.turbomole.com: Karlsruhe, Germany, 2020. 
+ +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Workflow +1) We used COSMOquick to generate possible tautomeric states. Irrelevant states where discarded due to an internal energy threshold implemented in COSMOquick. +By a subsequent manual check, we sorted out destroyed molecules and failed calculations. We conducted a conformational search of every microstate with COSMOconf using up to 150 conformers. +2) For each conformer we performed a geometry optimization using the BP86 functional with a TZVP basis set and the COSMO solvation scheme, +followed by a single point energy calculation using the BP86 functional with a def2-TZVPD basis set and the FINE COSMO cavity. +All density functional theory calculations were carried out with the TURBOMOLE 7.5 program package. +We manually checked the outcome for destroyed molecules and failed calculations. 
+3) A conformer selection was done by applying COSMOconf (using internally COSMOtherm) to reduce the number of conformers and tautomers for the neutral molecule sets. +The final set of the neutral state contains only those conformers and states that are relevant in liquid solutions. +This set may contain various microstates, if they have not been identified as irrelevant in one of the steps before. +4) The COSMOtherm software, version 2020, was used to calculate the free energy difference for each molecule set from step 2 and to calculate the relative weight of the microstates in water. +All free energy calculations were carried out using the BP-TZVPD-FINE 20 level of COSMO-RS in COSMOtherm. +Within the used COSMO-RS an ensemble of conformers and microstates is automatically used and weighted according to the total free energy in the respective liquid phase, i.e. different weights are used in water and octanol. + +The applied COSMO-RS version BP-TZVPD-FINE 20 shows a general root mean square deviation (RMSD) of 0.6 kcal/mol with regard to experimental octanol/water partitioning data. +COSMO-RS algorithm by itself has no statistical error. The overall workflow including the conformational search (or molecule or state) has a statistical noise smaller than 0.1 kcal/mol. 
+ +SM25_extra001, c1ccc(cc1)CCC(=O)(NS(=O)(=O)c2ccccc2),100% population +SM26_micro000, CC(=O)NS(=O)(=O)CCc1ccccc1,100% population +SM27_micro000, CC1(COC1)NS(=O)(=O)CCc2ccccc2,100% population +SM28_micro000, CC(=O)NC1(CS(=O)(=O)C1)CCc2ccccc2,100% population +SM29_micro000, CS(=O)(=O)NC1(COC1)CCc2ccccc2,100% population +SM30_micro000, c1ccc(cc1)CCC2(COC2)NS(=O)(=O)c3ccccc3,100% population +SM31_micro000, CN(C)S(=O)(=O)NC1(COC1)CCc2ccccc2,100% population +SM32_micro000, CS(=O)(=O)NC1(CSC1)CCc2ccccc2,100% population +SM33_micro000, c1ccc(cc1)CCC2(CSC2)NS(=O)(=O)c3ccccc3,100% population +SM34_micro000, CN(C)S(=O)(=O)NC1(CSC1)CCc2ccccc2,100% population +SM35_micro002, CS(=O)(=O)N[C@]1(CCC2=CC=CC=C2)C[S@](=O)C1,57% population +SM35_micro000, CS(=O)(=O)N[C@]1(CCc2ccccc2)CS(=O)C1,43% population +SM36_micro002, O=[S@]1C[C@](CCC2=CC=CC=C2)(C1)NS(=O)(=O)C1=CC=CC=C1,95% population +SM36_micro000, O=S1C[C@](CCc2ccccc2)(C1)NS(=O)(=O)c1ccccc1,5% population +SM37_micro000, CN(C)S(=O)(=O)N[C@]1(CCc2ccccc2)CS(=O)C1,54% population +SM37_micro003, CN(C)S(=O)(=O)N[C@]1(CCC2=CC=CC=C2)C[S@](=O)C1,46% population +SM38_micro000, CS(=O)(=O)NC1(CS(=O)(=O)C1)CCc2ccccc2,100% population +SM39_micro000, c1ccc(cc1)CCC2(CS(=O)(=O)C2)NS(=O)(=O)c3ccccc3,100% population +SM40_micro000, CN(C)S(=O)(=O)NC1(CS(=O)(=O)C1)CCc2ccccc2,100% population +SM41_micro000, CS(=O)(=O)Nc1cc(on1)c2ccccc2,100% population +SM41_extra001, CS(=O)(=O)N=C1[NH+]OC(=C1)c1ccccc1,0% population +SM42_extra001, O=S(=O)(Nc1cc(on1)-c1ccccc1)c1ccccc1,100% population +SM42_micro000, O=S(=O)(N=c1cc(o[nH]1)-c1ccccc1)c1ccccc1,0% population +SM43_extra001, CN(C)S(=O)(=O)Nc1cc(on1)-c1ccccc1,100% population +SM43_micro000, CN(C)S(=O)(=O)N=c1cc(o[nH]1)c2ccccc2,0% population +SM44_micro000, CS(=O)(=O)Nc1cn(nn1)c2ccccc2,100% population +SM44_extra001, CS(=O)(=O)[N-]C1=CN(N=[NH+]1)c1ccccc1,0% population +SM45_micro000, c1ccc(cc1)n2cc(nn2)NS(=O)(=O)c3ccccc3,98% population +SM45_extra001, O=S(=O)([N-]C1=CN(N=[NH+]1)C1=CC=CC=C1)C1=CC=CC=C1,2% 
population +SM46_micro000, CN(C)S(=O)(=O)Nc1cn(nn1)c2ccccc2,100% population +SM46_extra001, CN(C)S(=O)(=O)[N-]C1=CN(N=[NH+]1)c1ccccc1,0% population + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The ""Ranked:"" keyword is required +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-MLRUCR-1.csv b/physical_property/logP/Analysis/Submissions/logP-MLRUCR-1.csv new file mode 100644 index 00000000..a1ee0d93 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-MLRUCR-1.csv @@ -0,0 +1,144 @@ + +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-3.20,1.05,1.36 +SM26,SM26_micro000,-1.61,1.05,1.36 +SM27,SM27_micro000,-2.00,1.05,1.36 +SM28,SM28_micro000,-2.55,1.05,1.36 +SM29,SM29_micro000,-2.00,1.05,1.36 +SM30,SM30_micro000,-3.73,1.05,1.36 +SM31,SM31_micro000,-2.10,1.05,1.36 +SM32,SM32_micro000,-2.69,1.05,1.36 +SM33,SM33_micro000,-4.42,1.05,1.36 +SM34,SM34_micro000,-2.79,1.05,1.36 +SM35,SM35_micro000,-1.86,1.05,1.36 +SM36,SM36_micro000,-3.59,1.05,1.36 +SM37,SM37_micro000,-1.97,1.05,1.36 +SM38,SM38_micro000,-1.27,1.05,1.36 +SM39,SM39_micro000,-3.01,1.05,1.36 +SM40,SM40_micro000,-1.38,1.05,1.36 +SM41,SM41_micro000,-1.97,1.05,1.36 +SM42,SM42_micro000,-2.14,1.05,1.36 +SM43,SM43_micro000,-0.52,1.05,1.36 +SM44,SM44_micro000,-1.89,1.05,1.36 +SM45,SM45_micro000,-3.62,1.05,1.36 +SM46,SM46_micro000,-1.99,1.05,1.36 + + +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Computational Biophysics, Biology and Bioinformatics Group (CB3 Group) + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Costa Rica + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-prediction-method-MLR + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +30 min, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The Multiple Linear Regression (MLR) was built using R packages for data science, also, in-house scripts for taking into account the frequency of functional groups. All computations were performed in desktop computers belonging to the research group. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +ChemmineR +ChemmieOB +Obabel + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +A Multiple Linear Regression (MLR) was made from experimental logP values from 60 sulfonamides, obtained from PubChem and DrugBank. 
The dataset contained mainly sulfonamide drugs and smaller molecules with other classical functional groups. In order to create the MLR, the following descriptors were used: the frequency of functional groups, hydrogen bond acceptors (HBA), hydrogen bond donors (HBD), molar refractivity (MR) and topological polar surface area (PSA). The functional group frequency was calculated with an in-house script based on a modified function of OpenBabel; the rest were obtained from supplied OpenBabel properties. + +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-B.csv b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-B.csv new file mode 100644 index 00000000..b4246bd6 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-B.csv @@ -0,0 +1,252 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc).
+# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +# +# N.B. the third field is (DG_oct - DG_wat) where DG_oct/wat are the solvation free energies. 
+# The o/w partition coefficient is defined as LogP = - TFE/(RT log(10)) +# +Predictions: +SM25,SM25_micro000,-2.40,0.08,2 +SM26,SM26_micro000, 0.58,0.10,2 +SM27,SM27_micro000,-1.96,0.17,2 +SM28,SM28_micro000,-1.89,0.13,2 +SM29,SM29_micro000,-1.67,0.10,2 +SM30,SM30_micro000,-4.84,0.19,2 +SM31,SM31_micro000,-4.26,0.15,2 +SM32,SM32_micro000,-3.15,0.09,2 +SM33,SM33_micro000,-6.32,0.19,2 +SM34,SM34_micro000,-4.99,0.14,2 +SM35,SM35_micro000, 0.10,0.22,2 +SM36,SM36_micro000,-1.22,0.20,2 +SM37,SM37_micro000,-1.51,0.10,2 +SM38,SM38_micro000, 0.48,0.13,2 +SM39,SM39_micro000,-2.75,0.29,2 +SM40,SM40_micro000,-0.84,0.15,2 +SM41,SM41_micro000,-3.69,0.09,2 +SM42,SM42_micro000,-6.19,0.17,2 +SM43,SM43_micro000,-5.69,0.10,2 +SM44,SM44_micro000,-3.40,0.06,2 +SM45,SM45_micro000,-5.63,0.18,2 +SM46,SM46_micro000,-5.07,0.11,2 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Piero Procacci + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Universita' di Firenze + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +NES-1 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +12000 core-hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +The HREM stage (gas-phase) for collecting the initial states with the +ghost solute in the solvent was done on a local Intel i7 quadcore +workstation in about 15 minutes on per molecule + +All NES-1 computations were done on the 24K cores CRESCO6 ENEA cluster +equipped with Intel Skylake 48 cores CPU 2.4 GHz. + +For each molecule in water and octanol we launched 480 trajectories (each with 6 NTHREADS) for a total of 2880 cores +per solvation/hydration free energy. + +The wall clock time was 36 minutes for each solvation/hydration free energy calculations. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +ORAC 6.0.1 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: + +Unidirectional nonequilibrium switching (NES-1) (see +doi:10.1007/s10822-019-00233-9) + +HREM stage +In NES-1, for both w and o solvent, only the compound configurations +at full decoupling (lambda=0, gas-phase) are canonically sampled using +Hamiltonian Replica Exchange with Torsional Tempering (HREM-TT) (doi: +10.1002/jcc.21388). Gas-phase HREM-TT (8 ns on the target state and +eight replicas with min. scaling factor of 0.1 [ 3000 K] on a single +molecule in an empty box ) are done by scaling along the replica +progression only the torsional potentials (including the 1-4 non-bonded +interactions). + +NS stage +The starting configurations at lambda=0 (no coupling) for the fast +growth (NE) process are prepared by combining 480 gas-phase H-REM +sampled configurations with a few snapshots taken from NPT equilibrated +runs of pure solvents. In water and in 1-octanol the nonequilibrium +switching (NES) alchemical switch-on (fast growth) time was 300 ps in +both solvents. LJ interactions were first turned on in the first 240 +ps and the molecule was recharged in the following 60 ps. LJ work and +QQ were saved for each trajectory and checked for correlation (see +below). 
+ + +Post-processing stage + +The correlation coefficient and Kendall rank coefficient for the LJ +and Q work are reported below for each system +# system R^2 tau ndeg +SM25_o_300_G.lwrk -0.06 -0.02 480 +SM25_w_300_G.lwrk 0.00 0.01 480 +SM26_o_300_G.lwrk 0.04 0.02 480 +SM26_w_300_G.lwrk 0.01 -0.00 480 +SM27_o_300_G.lwrk 0.00 -0.01 480 +SM27_w_300_G.lwrk 0.03 0.00 480 +SM28_o_300_G.lwrk -0.07 -0.05 480 +SM28_w_300_G.lwrk 0.10 0.06 480 +SM29_o_300_G.lwrk -0.01 0.00 480 +SM29_w_300_G.lwrk 0.01 0.00 480 +SM30_o_300_G.lwrk -0.05 -0.02 480 +SM30_w_300_G.lwrk 0.16 0.11 480 +SM31_o_300_G.lwrk -0.11 -0.06 480 +SM31_w_300_G.lwrk -0.07 -0.06 480 +SM32_o_300_G.lwrk -0.06 -0.05 480 +SM32_w_300_G.lwrk 0.04 0.03 480 +SM33_o_300_G.lwrk -0.01 0.03 480 +SM33_w_300_G.lwrk -0.03 -0.02 480 +SM34_o_300_G.lwrk -0.04 -0.02 480 +SM34_w_300_G.lwrk 0.06 0.05 480 +SM35_o_300_G.lwrk -0.06 -0.03 480 +SM35_w_300_G.lwrk 0.05 0.03 480 +SM36_o_300_G.lwrk 0.01 0.02 480 +SM36_w_300_G.lwrk 0.05 0.02 480 +SM37_o_300_G.lwrk -0.09 -0.08 480 +SM37_w_300_G.lwrk 0.07 0.05 480 +SM38_o_300_G.lwrk -0.01 -0.01 480 +SM38_w_300_G.lwrk 0.03 0.01 480 +SM39_o_300_G.lwrk -0.14 -0.08 480 +SM39_w_300_G.lwrk -0.03 -0.02 480 +SM40_o_300_G.lwrk -0.09 -0.04 480 +SM40_w_300_G.lwrk 0.05 0.03 480 +SM41_o_300_G.lwrk 0.02 0.02 480 +SM41_w_300_G.lwrk -0.02 -0.02 480 +SM42_o_300_G.lwrk -0.09 -0.05 480 +SM42_w_300_G.lwrk -0.01 -0.02 480 +SM43_o_300_G.lwrk -0.05 -0.03 480 +SM43_w_300_G.lwrk -0.05 -0.05 480 +SM44_o_300_G.lwrk 0.02 0.02 480 +SM44_w_300_G.lwrk -0.00 0.00 480 +SM45_o_300_G.lwrk 0.06 0.04 480 +SM45_w_300_G.lwrk -0.01 -0.01 480 +SM46_o_300_G.lwrk 0.08 0.04 480 +SM46_w_300_G.lwrk -0.04 -0.03 480 +The solvation free energies are recovered using the Jarzynski estimate +by combining the two independent RV's, LJ work and the electrostatic +work, and by forming the convolution work distribution P(W)=\sum_i P_lj +(W_i)P_{q}(W-W_i) (see doi.org/10.1063/1.5120616 ). 
The NES +simulations were performed under Periodic Boundary Conditions (mean +box size 40 nm^3 circa in all cases) in the NPT ensemble (standard +conditions, Parrinello Rahman isotropic stress, Nose' thermostats ) +using PME-Ewald (alpha=0.37 and 1 Angs grid spacing in the direct +space) and a LJ cut off of 1.3 nm. Dry octanol was used in all +cases. + +Force field section. +The force field is GAFF2 (generalized Amber Force Field, version 2, +www.amber.org ). The FF parameters (bonded, charges and LJ) and +topology of the host and guests molecules were prepared using the +PrimaDORAC interface ( doi: 10.1021/acs.jcim.7b00145). The water model +is OPC3 ( doi: 10.1063/1.4960175.). 1-octanol parameterization was +done with PrimaDORAC. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-G.csv b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-G.csv new file mode 100644 index 00000000..33e090ed --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-G.csv @@ -0,0 +1,200 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. 
+# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. 
+# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +# +# N.B. the third field is (DG_oct - DG_wat) where DG_oct/wat are the solvation free energies. +# The o/w partition coefficient is defined as LogP = - TFE/(RT log(10)) +# +Predictions: +SM25,SM25_micro000,-2.00,0.77,2 +SM26,SM26_micro000, 0.33,0.74,2 +SM27,SM27_micro000,-2.21,0.77,2 +SM28,SM28_micro000,-0.51,0.98,2 +SM29,SM29_micro000,-1.70,0.77,2 +SM30,SM30_micro000,-3.48,0.67,2 +SM31,SM31_micro000,-3.44,0.58,2 +SM32,SM32_micro000,-2.56,0.71,2 +SM33,SM33_micro000,-5.38,0.85,2 +SM34,SM34_micro000,-4.47,0.65,2 +SM35,SM35_micro000, 0.11,1.00,2 +SM36,SM36_micro000,-1.59,0.94,2 +SM37,SM37_micro000,-1.33,0.94,2 +SM38,SM38_micro000, 0.34,0.89,2 +SM39,SM39_micro000,-0.66,0.94,2 +SM40,SM40_micro000,-0.60,0.88,2 +SM41,SM41_micro000,-3.25,0.70,2 +SM42,SM42_micro000,-4.49,0.71,2 +SM43,SM43_micro000,-4.99,0.66,2 +SM44,SM44_micro000,-2.89,0.72,2 +SM45,SM45_micro000,-5.21,0.90,2 +SM46,SM46_micro000,-4.96,0.65,2 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Piero Procacci + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Universita' di Firenze + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +NES-1 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. 
+# The 'Compute time:' keyword is required as shown here. +Compute time: +12000 core-hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +The HREM stage (gas-phase) for collecting the initial states with the +ghost solute in the solvent was done on a local Intel i7 quadcore +workstation in about 15 minutes on per molecule + +All NES-1 computations were done on the 24K cores CRESCO6 ENEA cluster +equipped with Intel Skylake 48 cores CPU 2.4 GHz. + +For each molecule in water and octanol we launched 480 trajectories (each with 6 NTHREADS) for a total of 2880 cores +per solvation/hydration free energy. + +The wall clock time was 36 minutes for each solvation/hydration free energy calculations. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +ORAC 6.0.1 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: + +Unidirectional nonequilibrium switching (NES-1) (see +doi:10.1007/s10822-019-00233-9) + +HREM stage +In NES-1, for both w and o solvent, only the compound configurations +at full decoupling (lambda=0, gas-phase) are canonically sampled using +Hamiltonian Replica Exchange with Torsional Tempering (HREM-TT) (doi: +10.1002/jcc.21388). Gas-phase HREM-TT (8 ns on the target state and +eight replicas with min. scaling factor of 0.1 [ 3000 K] on a single +molecule in an empty box ) are done by scaling along the replica +progression only the torsional potentials (including the 1-4 non-bonded +interactions). + +NS stage +The starting configurations at lambda=0 (no coupling) +for the fast growth (NE) process are prepared by combining 480 +gas-phase H-REM sampled configurations with a few snapshots taken from +NPT equilibrated runs of pure solvents. In water and in 1-octanol the +nonequilibrium switching (NES) alchemical switch-on (fast growth) time +was 300 ps in both solvents. LJ interactions were first turned on in +the first 240 ps and the molecule was recharged in the following 60 +ps. + +Post-processing stage +The solvation free energies are recovered using the Gaussian estimate +DG = <W> - 0.5*\beta\sigma^2, evaluated on the 480 work values. The +NES simulations were performed under Periodic Boundary Conditions +(mean box size 40 nm^3 circa in all cases) in the NPT ensemble +(standard conditions, Parrinello Rahman isotropic stress, Nose' +thermostats ) using PME-Ewald (alpha=0.37 and 1 Angs grid spacing in +the direct space) and a LJ cut off of 1.3 nm. Dry octanol was used +in all cases. 
+ +Force field section +The force field is GAFF2 (generalized Amber Force Field, version 2, +www.amber.org ). The FF parameters (bonded, charges and LJ) and +topology of the host and guests molecules were prepared using the +PrimaDORAC interface ( doi: 10.1021/acs.jcim.7b00145). The water model +is OPC3 ( doi: 10.1063/1.4960175.). 1-octanol parameterization was +done with PrimaDORAC. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-J.csv b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-J.csv new file mode 100644 index 00000000..dda52533 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-PieroProcacci-NES1-J.csv @@ -0,0 +1,198 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. 
`SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +# +# N.B. the third field is (DG_oct - DG_wat) where DG_oct/wat are the solvation free energies. 
+# The o/w partition coefficient is defined as LogP = - TFE/(RT log(10)) +# +Predictions: +SM25,SM25_micro000,-1.34,0.52,2 +SM26,SM26_micro000,0.77,0.97,2 +SM27,SM27_micro000,-1.58,0.91,2 +SM28,SM28_micro000,-0.26,1.13,2 +SM29,SM29_micro000,-1.38,0.88,2 +SM30,SM30_micro000,-3.23,0.79,2 +SM31,SM31_micro000,-3.36,0.52,2 +SM32,SM32_micro000,-2.81,0.67,2 +SM33,SM33_micro000,-5.01,0.72,2 +SM34,SM34_micro000,-4.40,0.80,2 +SM35,SM35_micro000,0.79,1.89,2 +SM36,SM36_micro000,-1.21,1.16,2 +SM37,SM37_micro000,-1.45,1.21,2 +SM38,SM38_micro000,1.25,1.23,2 +SM39,SM39_micro000,-1.79,1.08,2 +SM40,SM40_micro000,-0.38,0.83,2 +SM41,SM41_micro000,-3.36,0.95,2 +SM42,SM42_micro000,-3.96,0.83,2 +SM43,SM43_micro000,-5.16,1.09,2 +SM44,SM44_micro000,-3.15,1.02,2 +SM45,SM45_micro000,-5.38,1.19,2 +SM46,SM46_micro000,-4.45,1.32,2 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Piero Procacci + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Universita' di Firenze + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +NES-1 + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +12000 core-hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +The HREM stage (gas-phase) for collecting the initial states with the +ghost solute in the solvent was done on a local Intel i7 quadcore +workstation in about 15 minutes on per molecule + +All NES-1 computations were done on the 24K cores CRESCO6 ENEA cluster +equipped with Intel Skylake 48 cores CPU 2.4 GHz. + +For each molecule in water and octanol we launched 480 trajectories (each with 6 NTHREADS) for a total of 2880 cores +per solvation/hydration free energy. + +The wall clock time was 36 minutes for each solvation/hydration free energy calculations. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +ORAC 6.0.1 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: + +Unidirectional nonequilibrium switching (NES-1) (see +doi:10.1007/s10822-019-00233-9) + +HREM stage +In NES-1, for both w and o solvent, only the compound configurations +at full decoupling (lambda=0, gas-phase) are canonically sampled using +Hamiltonian Replica Exchange with Torsional Tempering (HREM-TT) (doi: +10.1002/jcc.21388). Gas-phase HREM-TT (8 ns on the target state and +eight replicas with min. scaling factor of 0.1 [ 3000 K] on a single +molecule in an empty box ) are done by scaling along the replica +progression only the torsional potentials (including the 1-4 non-bonded +interactions). + +NS stage +The starting configurations at lambda=0 (no coupling) +for the fast growth (NE) process are prepared by combining 480 +gas-phase H-REM sampled configurations with a few snapshots taken from +NPT equilibrated runs of pure solvents. In water and in 1-octanol the +nonequilibrium switching (NES) alchemical switch-on (fast growth) time +was 300 ps in both solvents. LJ interactions were first turned on in +the first 240 ps and the molecule was recharged in the following 60 +ps. + +Post-processing stage + +The solvation free energies are recovered using the Jarzynski estimate +evaluated on the 480 work values. The NES simulations were performed +under Periodic Boundary Conditions (mean box size 40 nm^3 circa in all +cases) in the NPT ensemble (standard conditions, Parrinello Rahman +isotropic stress, Nose' thermostats ) using PME-Ewald (alpha=0.37 and +1 Angs grid spacing in the direct space) and a LJ cut off of 1.3 nm. +Dry octanol was used in all cases. + +The force field is GAFF2 (generalized Amber Force Field, version 2, +www.amber.org ). The FF parameters (bonded, charges and LJ) and +topology of the host and guests molecules were prepared using the +PrimaDORAC interface ( doi: 10.1021/acs.jcim.7b00145). The water model +is OPC3 ( doi: 10.1063/1.4960175.). 1-octanol parameterization was +done with PrimaDORAC. 
+ +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP-dddc-1.csv b/physical_property/logP/Analysis/Submissions/logP-dddc-1.csv new file mode 100644 index 00000000..d23fcdcf --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-dddc-1.csv @@ -0,0 +1,173 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +# Example of a submission that used only challenge provided neutral microstate (`SMXX_micro000` from the provided input files) +SM25,SM25_micro000,-3.01,0.08,0.50 +SM26,SM26_micro000,-1.17,0.08,0.50 +SM27,SM27_micro000,-1.99,0.12,0.50 +SM28,SM28_micro000,-1.37,0.28,0.50 +SM29,SM29_micro000,-1.94,0.05,0.50 +SM30,SM30_micro000,-3.57,0.07,0.50 +SM31,SM31_micro000,-2.56,0.23,0.50 +SM32,SM32_micro000,-2.59,0.15,0.50 +SM33,SM33_micro000,-4.56,0.10,0.50 +SM34,SM34_micro000,-3.16,0.20,0.50 +SM35,SM35_micro000,-1.97,0.12,0.50 +SM36,SM36_micro000,-3.86,0.36,0.50 +SM37,SM37_micro000,-2.63,0.44,0.50 +SM38,SM38_micro000,-1.10,0.02,0.50 +SM39,SM39_micro000,-2.79,0.09,0.50 +SM40,SM40_micro000,-1.72,0.37,0.50 +SM41,SM41_micro000,-2.78,0.08,0.50 +SM42,SM42_micro000,-4.53,0.00,0.50 +SM43,SM43_micro000,-3.33,0.32,0.50 +SM44,SM44_micro000,-1.08,0.31,0.50 +SM45,SM45_micro000,-2.91,0.10,0.50 +SM46,SM46_micro000,-1.97,0.05,0.50 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Xiaoyu Ding, Xutong Li + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Shanghai Institute of Materia Medica +University of Chinese Academy of Sciences + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-prediction-Attentive FP + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. 
+Compute time: +0.01 hours, CPU +0.01 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +Attentive FP was trained and performed on one NVIDIA Tesla V100 on a single machine +hosting an Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +pytorch 1.4.0 +torch-geometric 1.3.2 +theano 1.0.4 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: +Data Curation and Preparation +Data were collected from literature and databases. More details are not shown. +The compound structures were cleaned and normalized by our in-house script. +If a compound had multiple logP values, the arithmetic mean of all values was used as the final annotation, +unless the reported logP values covered a range of more than 0.5. +Since the number of records is too large, random sampling was performed on the full dataset. +About half of the compounds were randomly selected for model building within the limited time. +The final dataset consists of 13374 compounds. + +Model building +All compounds were randomly divided into training and test sets by a ratio of 4:1. +Five-fold cross-validation was performed within the training dataset during the training process. +In detail, the training dataset was split into five folds +and each fold was then used once as a validation set, whereas the four remaining folds formed the training set. +For each fold, early stopping was applied to the training process. +We set a maximum epoch of 500, and if the performance root mean squared error (RMSE) had not improved in 8 epochs on the training set and in 18 epochs on the validation set, +the training process was terminated early. +Attentive FP[1], which was previously reported by our group, was applied to build the prediction model. +The six hyper-parameters were set as follows: k (the number of attentive layers for atom embedding) = 4, +t (the number of attentive layers for molecule embedding) = 4, +fingerprint dimension = 256, L2 weight decay = 10^-5, learning rate = 10^-5, and dropout rate = 0.4. +Finally, the predictions of logP were converted into transfer free energy (TFE) through multiplying by -1.364. +We performed three independent runs with different random seeds to train the model and obtain the standard error of the mean (SEM) as a measure of statistical uncertainty. 
+Model uncertainty is calculated as the RMSE between predicted and experimental values for the test set. 
+ +[1] Xiong Z, Wang D, Liu X, et al. Pushing the Boundaries of Molecular Representation for Drug Discovery with the Graph Attention Mechanism[J]. Journal of Medicinal Chemistry, 2019. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP-dddc-2.csv b/physical_property/logP/Analysis/Submissions/logP-dddc-2.csv new file mode 100644 index 00000000..57fea966 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP-dddc-2.csv @@ -0,0 +1,172 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, `SM26_micro000 SM26_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +# Example of a submission that used only challenge provided neutral microstate (`SMXX_micro000` from the provided input files) +SM25,SM25_micro000,-3.17,0.19,0.41 +SM26,SM26_micro000,-1.45,0.19,0.41 +SM27,SM27_micro000,-2.10,0.24,0.41 +SM28,SM28_micro000,-1.97,0.24,0.41 +SM29,SM29_micro000,-2.08,0.31,0.41 +SM30,SM30_micro000,-3.83,0.31,0.41 +SM31,SM31_micro000,-2.29,0.28,0.41 +SM32,SM32_micro000,-3.05,0.41,0.41 +SM33,SM33_micro000,-4.89,0.38,0.41 +SM34,SM34_micro000,-3.22,0.29,0.41 +SM35,SM35_micro000,-1.99,0.61,0.41 +SM36,SM36_micro000,-3.84,0.75,0.41 +SM37,SM37_micro000,-2.13,0.37,0.41 +SM38,SM38_micro000,-1.34,0.25,0.41 +SM39,SM39_micro000,-3.36,0.41,0.41 +SM40,SM40_micro000,-1.75,0.37,0.41 +SM41,SM41_micro000,-2.20,0.17,0.41 +SM42,SM42_micro000,-4.12,0.20,0.41 +SM43,SM43_micro000,-2.69,0.36,0.41 +SM44,SM44_micro000,-1.30,0.19,0.41 +SM45,SM45_micro000,-2.93,0.30,0.41 +SM46,SM46_micro000,-1.74,0.31,0.41 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Xiaoyu Ding, Xutong Li + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Shanghai Institute of Materia Medica +University of Chinese Academy of Sciences + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +GROVER + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here.
+Compute time: +0.01 hours + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +GROVER was trained and performed on one NVIDIA P40 on a single machine +hosting an Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +pytorch1.1 +python3.6 +rdkit2019.03.3.0 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Data Curation and Preparation +Data were collected from literature and databases. More detail are not shown. 
+The compound structures were cleaned and normalized by our in-house script. +If a compound had multiple logP values, the arithmetic mean of all values was used as the final annotation, +unless the reported logP values covered a range of more than 0.5. +Since the number of records is too large, random sampling was performed on the full dataset. +About half of the compounds were randomly selected for model building within the limited time. +The final dataset consists of 13374 compounds. + +Model building +All compounds were randomly divided into training and test sets by a ratio of 4:1. +Five-fold cross-validation was performed within the training dataset during the training process; five models were built for each fold with different initial parameters for the ensemble. +In detail, the training dataset was split into five folds +and each fold was then used once as a validation set, whereas the four remaining folds formed the training set. +Five models were built for each fold with different initial parameters for the ensemble. +For each fold, early stopping was applied to the training process. +We set a maximum of 80 epochs, and if the root mean squared error (RMSE) had not improved in 10 epochs on the validation set, +the training process was terminated early. +GROVER[1], which was previously reported by Rong et al., was applied to build the prediction model. +The six hyper-parameters were set as follows: +batch_size = 32, +attn_hidden = 16, L2 weight decay = 10^-7, learning rate = 10^-5, and dropout rate = 0. +Finally, the predictions of logP were converted into transfer free energy (TFE) through multiplying by -1.364. +We performed five independent runs with different random seeds to train the model and obtain the standard error of the mean (SEM) as a measure of statistical uncertainty. +Model uncertainty is calculated as the RMSE between predicted and experimental values for the test set. +[1] Rong Y, Bian Y, Xu T, et al.
GROVER: Self-supervised Message Passing Transformer on Large-scale Molecular Data[J]. arXiv preprint arXiv:2007.02835, 2020. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_1.csv b/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_1.csv new file mode 100644 index 00000000..bfc20fcd --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_1.csv @@ -0,0 +1,167 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro000,-4.06,0.07,2.12 +SM26,SM26_micro000,-0.18,0.08,2.12 +SM27,SM27_micro000,-2.58,0.10,2.12 +SM28,SM28_micro000,-1.92,0.22,2.12 +SM29,SM29_micro000,-2.19,0.10,2.12 +SM30,SM30_micro000,-5.01,0.11,2.12 +SM31,SM31_micro000,-2.71,0.11,2.12 +SM32,SM32_micro000,-4.15,0.09,2.12 +SM33,SM33_micro000,-7.53,0.09,2.12 +SM34,SM34_micro000,-6.13,0.13,2.12 +SM35,SM35_micro000,-2.00,0.09,2.12 +SM36,SM36_micro000,-2.06,0.11,2.12 +SM37,SM37_micro000,-2.00,0.10,2.12 +SM38,SM38_micro000,-1.38,0.11,2.12 +SM39,SM39_micro000,-3.81,0.14,2.12 +SM40,SM40_micro000,-2.72,0.11,2.12 +SM41,SM41_micro000,-3.81,0.07,2.12 +SM42,SM42_micro000,-4.87,0.19,2.12 +SM43,SM43_micro000,-3.74,0.06,2.12 +SM44,SM44_micro000,-2.25,0.06,2.12 +SM45,SM45_micro000,-4.81,0.08,2.12 +SM46,SM46_micro000,-3.51,0.13,2.12 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-MD-neat-octanol + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +2800, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). Calculations involving water +were performed using 10 processors per calculation, while the calculations involving octanol +were performed using 8 processors per calculation. The reported compute time is taken as the +walltime times the number of processors (core hours). Pitzer has a mix of processors. We did not specify the +processor type upon submission, so a mix was used. All of the calculations were performed during a +trial period during the cluster expansion. The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +gromacs/2020.2 +amber/20 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section.
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using molecular dynamics with Gromacs plus MBAR at 298.15 K and 1 bar +in water (pure) and 1-octanol (pure). The solute was coupled/decoupled using a total of 15 states (including +ideal gas and fully interacting states). After equilibration, 16 ns trajectories were used for the free energy calculations. +The uncertainty in the solvation free energy calculations was computed using a timeseries analysis +as implemented in PyMBAR. The model uncertainty was taken as the rough estimate used in the +SAMPL6 overview publication. + +Water and 1-octanol were modeled using TIP4P and TraPPE-UA, respectively. +In this set of calculations 1-octanol was modeled as pure. In a second set of +calculations we will use water-saturated 1-octanol. The combination of models +was used as their phase behavior has been rigorously studied in the past, with +LLE calculations performed to determine mutual solubilities. + +The solutes were modeled using GAFF with point charges obtained using RESP +with HF/6-31G(d). + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_2.csv b/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_2.csv new file mode 100644 index 00000000..e6cce2f1 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_AndrewPaluch_MD_2.csv @@ -0,0 +1,169 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro000,-5.60,0.08,2.12 +SM26,SM26_micro000,-2.25,0.18,2.12 +SM27,SM27_micro000,-3.14,0.21,2.12 +SM28,SM28_micro000,-2.73,0.10,2.12 +SM29,SM29_micro000,-3.22,0.21,2.12 +SM30,SM30_micro000,-4.21,0.12,2.12 +SM31,SM31_micro000,-3.85,0.11,2.12 +SM32,SM32_micro000,-4.22,0.08,2.12 +SM33,SM33_micro000,-6.35,0.09,2.12 +SM34,SM34_micro000,-7.01,0.13,2.12 +SM35,SM35_micro000,-2.72,0.17,2.12 +SM36,SM36_micro000,-4.86,0.13,2.12 +SM37,SM37_micro000,-2.80,0.22,2.12 +SM38,SM38_micro000,-2.49,0.11,2.12 +SM39,SM39_micro000,-4.64,0.12,2.12 +SM40,SM40_micro000,-3.72,0.20,2.12 +SM41,SM41_micro000,-4.16,0.16,2.12 +SM42,SM42_micro000,-5.13,0.11,2.12 +SM43,SM43_micro000,-4.37,0.16,2.12 +SM44,SM44_micro000,-3.65,0.18,2.12 +SM45,SM45_micro000,-4.87,0.19,2.12 +SM46,SM46_micro000,-4.26,0.06,2.12 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-MD-wet-octanol + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +2900, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). Calculations involving water +were performed using 10 processors per calculation, while the calculations involving octanol +were performed using 8 processors per calculation. The reported compute time is taken as the +walltime times the number of processors (core hours). Pitzer has a mix of processors. We did not specify the +processor type upon submission, so a mix was used. All of the calculations were performed during a +trial period during the cluster expansion. The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +gromacs/2020.2 +amber/20 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section.
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using molecular dynamics with Gromacs plus MBAR at 298.15 K and 1 bar +in water (pure) and 1-octanol (water-saturated). Previous work has determined the mole fraction of water in +water-saturated 1-octanol to be 0.21 for the solvent force field combination used. +The solute was coupled/decoupled using a total of 15 states (including +ideal gas and fully interacting states). After equilibration, 16 ns trajectories were used for the free energy calculations. +The uncertainty in the solvation free energy calculations was computed using a timeseries analysis +as implemented in PyMBAR. The model uncertainty was taken as the rough estimate used in the +SAMPL6 overview publication. + +Water and 1-octanol were modeled using TIP4P and TraPPE-UA, respectively. +In this set of calculations 1-octanol was modeled as water-saturated. +This serves as a comparison to our first set of calculations using neat 1-octanol. +The combination of models was used as their phase behavior has been rigorously +studied in the past, with LLE calculations performed to determine mutual solubilities. + +The solutes were modeled using GAFF with point charges obtained using RESP +with HF/6-31G(d). + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM12_1.csv b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM12_1.csv new file mode 100644 index 00000000..2d3168b2 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM12_1.csv @@ -0,0 +1,164 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# SM12 with vacuum optimized geometries +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro000,-2.60,0.00,0.69 +SM26,SM26_micro000,-1.13,0.00,0.69 +SM27,SM27_micro000,-1.49,0.00,0.69 +SM28,SM28_micro000,-1.40,0.00,0.69 +SM29,SM29_micro000,-1.08,0.00,0.69 +SM30,SM30_micro000,-3.38,0.00,0.69 +SM31,SM31_micro000,-1.36,0.00,0.69 +SM32,SM32_micro000,-3.24,0.00,0.69 +SM33,SM33_micro000,-5.58,0.00,0.69 +SM34,SM34_micro000,-3.30,0.00,0.69 +SM35,SM35_micro000,-1.69,0.00,0.69 +SM36,SM36_micro000,-3.33,0.00,0.69 +SM37,SM37_micro000,-1.40,0.00,0.69 +SM38,SM38_micro000,0.07,0.00,0.69 +SM39,SM39_micro000,-2.30,0.00,0.69 +SM40,SM40_micro000,-0.40,0.00,0.69 +SM41,SM41_micro000,-0.82,0.00,0.69 +SM42,SM42_micro000,-3.66,0.00,0.69 +SM43,SM43_micro000,-1.34,0.00,0.69 +SM44,SM44_micro000,0.13,0.00,0.69 +SM45,SM45_micro000,-2.78,0.00,0.69 +SM46,SM46_micro000,-0.64,0.00,0.69 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Sergio Antonio Rodriguez +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-SM12-vacuum-opt + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +90, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +All of the calculations were performed during a trial period during the cluster expansion. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +QChem/5.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using electronic structure calculations using the SM12 continuum +solvation model in water and 1-octanol. All calculations were performed using QChem/5.3. +First, geometry optimizations were performed at the M062X/6-31+G(d,p) level of theory/basis set in vacuum. +This vacuum optimized geometry was then used to perform single point energy calculations in vacuum +and in the solvents at the same level of theory/basis set. + +The uncertainty in the solvation free energy calculations was taken to be 0. Given a fixed geometry, +the computed single point energy calculations should remain unchanged. The model uncertainty +was taken from our untrained SM12 submission in the SAMPL6 challenge, converting the computed +RMSE in logP to a RMSE in TFE in kcal/mol. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_1.csv b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_1.csv new file mode 100644 index 00000000..4c1d66b0 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_1.csv @@ -0,0 +1,164 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# SM8 with vacuum optimized geometries +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-2.18,0.00,0.60 +SM26,SM26_micro000,-1.75,0.00,0.60 +SM27,SM27_micro000,-2.28,0.00,0.60 +SM28,SM28_micro000,-1.96,0.00,0.60 +SM29,SM29_micro000,-1.85,0.00,0.60 +SM30,SM30_micro000,-3.67,0.00,0.60 +SM31,SM31_micro000,-2.05,0.00,0.60 +SM32,SM32_micro000,-3.37,0.00,0.60 +SM33,SM33_micro000,-5.39,0.00,0.60 +SM34,SM34_micro000,-3.47,0.00,0.60 +SM35,SM35_micro000,-1.48,0.00,0.60 +SM36,SM36_micro000,-3.11,0.00,0.60 +SM37,SM37_micro000,-1.67,0.00,0.60 +SM38,SM38_micro000,-0.44,0.00,0.60 +SM39,SM39_micro000,-2.33,0.00,0.60 +SM40,SM40_micro000,-0.66,0.00,0.60 +SM41,SM41_micro000,-1.02,0.00,0.60 +SM42,SM42_micro000,-2.78,0.00,0.60 +SM43,SM43_micro000,-1.26,0.00,0.60 +SM44,SM44_micro000,-0.39,0.00,0.60 +SM45,SM45_micro000,-2.49,0.00,0.60 +SM46,SM46_micro000,-0.92,0.00,0.60 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Sergio Antonio Rodriguez +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-SM8-vacuum-opt + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +20, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +All of the calculations were performed during a trial period during the cluster expansion. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +QChem/5.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using electronic structure calculations using the SM8 continuum +solvation model in water and 1-octanol. All calculations were performed using QChem/5.3. +First, geometry optimizations were performed at the M062X/6-31+G(d,p) level of theory/basis set in vacuum. +This vacuum optimized geometry was then used to perform single point energy calculations in vacuum +and in the solvents at the same level of theory/basis set. + +The uncertainty in the solvation free energy calculations was taken to be 0. Given a fixed geometry, +the computed single point energy calculations should remain unchanged. The model uncertainty +was taken from our untrained SM8 submission in the SAMPL6 challenge, converting the computed +RMSE in logP to a RMSE in TFE in kcal/mol. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_2.csv b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_2.csv new file mode 100644 index 00000000..df9227de --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SM8_2.csv @@ -0,0 +1,164 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# SM8 with solvent optimized geometries +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-1.58,0.00,0.60 +SM26,SM26_micro000,-1.39,0.00,0.60 +SM27,SM27_micro000,-1.78,0.00,0.60 +SM28,SM28_micro000,-1.39,0.00,0.60 +SM29,SM29_micro000,-1.37,0.00,0.60 +SM30,SM30_micro000,-3.03,0.00,0.60 +SM31,SM31_micro000,-1.45,0.00,0.60 +SM32,SM32_micro000,-2.89,0.00,0.60 +SM33,SM33_micro000,-4.82,0.00,0.60 +SM34,SM34_micro000,-2.79,0.00,0.60 +SM35,SM35_micro000,-1.07,0.00,0.60 +SM36,SM36_micro000,-2.17,0.00,0.60 +SM37,SM37_micro000,-0.48,0.00,0.60 +SM38,SM38_micro000,0.65,0.00,0.60 +SM39,SM39_micro000,-1.12,0.00,0.60 +SM40,SM40_micro000,0.48,0.00,0.60 +SM41,SM41_micro000,-0.60,0.00,0.60 +SM42,SM42_micro000,-2.47,0.00,0.60 +SM43,SM43_micro000,-0.28,0.00,0.60 +SM44,SM44_micro000,0.09,0.00,0.60 +SM45,SM45_micro000,-1.96,0.00,0.60 +SM46,SM46_micro000,-0.23,0.00,0.60 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Sergio Antonio Rodriguez +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-SM8-solvent-opt + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +1090, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +All of the calculations were performed during a trial period during the cluster expansion. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +QChem/5.3 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using electronic structure calculations using the SM8 continuum +solvation model in water and 1-octanol. All calculations were performed using QChem/5.3. +First, geometry optimization and single point energy calculations were performed at the M062X/6-31+G(d,p) level of theory/basis set in vacuum. +The vacuum optimized geometry then became the starting point for geometry optimizations and single point energy calculations in +water and 1-octanol, using the same level of theory/basis set. + +The uncertainty in the solvation free energy calculations was taken to be 0. Given a fixed geometry, +the computed single point energy calculations should remain unchanged. The model uncertainty +was taken from our untrained SM8 submission in the SAMPL6 challenge, converting the computed +RMSE in logP to a RMSE in TFE in kcal/mol. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_1.csv b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_1.csv new file mode 100644 index 00000000..544febee --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_1.csv @@ -0,0 +1,163 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# SMD with vacuum optimized geometries +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-0.90,0.00,1.47 +SM26,SM26_micro000,0.08,0.00,1.47 +SM27,SM27_micro000,0.08,0.00,1.47 +SM28,SM28_micro000,0.59,0.00,1.47 +SM29,SM29_micro000,0.60,0.00,1.47 +SM30,SM30_micro000,-1.85,0.00,1.47 +SM31,SM31_micro000,-0.26,0.00,1.47 +SM32,SM32_micro000,-1.51,0.00,1.47 +SM33,SM33_micro000,-3.80,0.00,1.47 +SM34,SM34_micro000,-2.06,0.00,1.47 +SM35,SM35_micro000,0.45,0.00,1.47 +SM36,SM36_micro000,-0.27,0.00,1.47 +SM37,SM37_micro000,0.79,0.00,1.47 +SM38,SM38_micro000,2.97,0.00,1.47 +SM39,SM39_micro000,0.78,0.00,1.47 +SM40,SM40_micro000,1.90,0.00,1.47 +SM41,SM41_micro000,1.12,0.00,1.47 +SM42,SM42_micro000,-1.39,0.00,1.47 +SM43,SM43_micro000,0.14,0.00,1.47 +SM44,SM44_micro000,2.71,0.00,1.47 +SM45,SM45_micro000,0.20,0.00,1.47 +SM46,SM46_micro000,1.46,0.00,1.47 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Sergio Antonio Rodriguez +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-SMD-vacuum-opt + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +22, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gaussian 16 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using electronic structure calculations using the SMD continuum +solvation model in water and 1-octanol. All calculations were performed using Gaussian 16. +First, geometry optimizations were performed at the M062X/6-31+G(d,p) level of theory/basis set in vacuum. +This vacuum optimized geometry was then used to perform single point energy calculations in vacuum +and in the solvents at the same level of theory/basis set. + +The uncertainty in the solvation free energy calculations was taken to be 0. Given a fixed geometry, +the computed single point energy calculations should remain unchanged. The model uncertainty +was taken from our untrained SMD submission in the SAMPL6 challenge, converting the computed +RMSE in logP to a RMSE in TFE in kcal/mol. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_2.csv b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_2.csv new file mode 100644 index 00000000..dd4de2b6 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_RodriguezPaluch_SMD_2.csv @@ -0,0 +1,163 @@ +# WATER-OCTANOL (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# SMD with solvent optimized geometries +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-0.58,0.00,1.47 +SM26,SM26_micro000,0.57,0.00,1.47 +SM27,SM27_micro000,-0.01,0.00,1.47 +SM28,SM28_micro000,1.08,0.00,1.47 +SM29,SM29_micro000,1.04,0.00,1.47 +SM30,SM30_micro000,-1.59,0.00,1.47 +SM31,SM31_micro000,0.18,0.00,1.47 +SM32,SM32_micro000,-1.17,0.00,1.47 +SM33,SM33_micro000,-3.56,0.00,1.47 +SM34,SM34_micro000,-1.25,0.00,1.47 +SM35,SM35_micro000,1.08,0.00,1.47 +SM36,SM36_micro000,0.23,0.00,1.47 +SM37,SM37_micro000,1.42,0.00,1.47 +SM38,SM38_micro000,3.75,0.00,1.47 +SM39,SM39_micro000,1.64,0.00,1.47 +SM40,SM40_micro000,2.96,0.00,1.47 +SM41,SM41_micro000,1.42,0.00,1.47 +SM42,SM42_micro000,-1.02,0.00,1.47 +SM43,SM43_micro000,1.72,0.00,1.47 +SM44,SM44_micro000,2.95,0.00,1.47 +SM45,SM45_micro000,1.17,0.00,1.47 +SM46,SM46_micro000,2.55,0.00,1.47 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Sergio Antonio Rodriguez +Andrew S. Paluch + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-SMD-solvent-opt + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +44, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gaussian 16 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Solvation free energy calculations were performed using electronic structure calculations using the SMD continuum +solvation model in water and 1-octanol. All calculations were performed using Gaussian 16. +First, geometry optimizations and single point energy calculations were performed at the M062X/6-31+G(d,p) level of theory/basis set in vacuum. +This vacuum optimized geometry was then used as a starting point to perform geometry optimizations and +single point energy calculations in water and 1-octanol at the same level of theory/basis set. + +The uncertainty in the solvation free energy calculations was taken to be 0. Given a fixed geometry, +the computed single point energy calculations should remain unchanged. The model uncertainty +was taken from our untrained SMD submission in the SAMPL6 challenge, converting the computed +RMSE in logP to a RMSE in TFE in kcal/mol. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_CGenFF.csv b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_CGenFF.csv new file mode 100644 index 00000000..a782c3f5 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_CGenFF.csv @@ -0,0 +1,151 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. +# +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro004,-6.55,0.18,1.50 +SM26,SM26_micro000,-2.39,0.11,1.50 +SM27,SM27_micro000,-3.31,0.13,1.50 +SM28,SM28_micro000,-2.52,0.13,1.50 +SM29,SM29_micro000,-2.64,0.15,1.50 +SM30,SM30_micro000,-5.73,0.14,1.50 +SM31,SM31_micro000,-4.16,0.13,1.50 +SM32,SM32_micro000,-5.95,0.16,1.50 +SM33,SM33_micro000,-7.44,0.16,1.50 +SM34,SM34_micro000,-6.08,0.14,1.50 +SM35,SM35_micro000,-2.70,0.36,1.50 +SM36,SM36_micro000,-3.65,0.60,1.50 +SM37,SM37_micro000,-3.31,0.75,1.50 +SM38,SM38_micro000,-2.26,0.41,1.50 +SM39,SM39_micro000,-5.10,0.41,1.50 +SM40,SM40_micro000,-3.54,0.31,1.50 +SM41,SM41_micro000,-3.53,0.18,1.50 +SM42,SM42_micro003,-7.01,0.13,1.50 +SM43,SM43_micro004,-5.10,0.16,1.50 +SM44,SM44_micro000,-1.15,0.22,1.50 +SM45,SM45_micro000,-5.15,0.41,1.50 +SM46,SM46_micro000,-3.13,0.29,1.50 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Bogdan I. Iorga/Oliver Beckstein +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +ICSN, CNRS, Gif-sur-Yvette, France/Arizona State University, USA +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +SAMPL7_logP_MDPOW_CGenFF +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +20,000 hours, CPU +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed in parallel (8 cores for each simulation) on cluster nodes running with CentOS6 and 4 CPU Intel Xeon E5-4627 v3 @ 2.60GHz. +# +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gromacs 2020.3 +MDPOW 0.7.0-dev +CGENFF 2.2.0 +# +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) +# +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Alchemical free energy calculations were performed in explicit +solvent, following the protocol described in [1,2]. 
Parameters were generated with the +PARAMCHEM CGENFF program via the server at +https://cgenff.umaryland.edu/ for CHARMM/CGenFF with the CHARMM TIP3P +water model. Files were prepared for Gromacs 2020.3. The alchemical data were +analyzed with thermodynamic integration. Errors are reported as errors +of the mean (see [1,2]). The +model uncertainty was estimated on the basis of the results from [2]. +[1] Kenney, I. M., Beckstein, O., and Iorga, B. I. (2016) Prediction of cyclohexane-water +distribution coefficients for the SAMPL5 data set using molecular dynamics simulations with +the OPLS-AA force field, J. Comput. Aided Mol. Des. 30(11):1045-1058 DOI: 10.1007/s10822-016-9949-5. +[2] Fan, S., Iorga, B. I., and Beckstein, O. (2020) Prediction of octanol-water partition coefficients for +the SAMPL6-logP molecules using molecular dynamics simulations with OPLS-AA, AMBER and CHARMM force fields, +J Comput Aided Mol Des 34(5):543-560 DOI: 10.1007/s10822-019-00267-z. +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_GAFF.csv b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_GAFF.csv new file mode 100644 index 00000000..9e06d09f --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_GAFF.csv @@ -0,0 +1,156 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. 
+# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. +# +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). 
+# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25_micro004,-4.76,0.33,1.50 +SM26,SM26_micro000,-1.60,0.20,1.50 +SM27,SM27_micro000,-3.84,0.35,1.50 +SM28,SM28_micro000,-3.08,0.34,1.50 +SM29,SM29_micro000,-3.18,0.24,1.50 +SM30,SM30_micro000,-5.30,0.42,1.50 +SM31,SM31_micro000,-4.34,0.25,1.50 +SM32,SM32_micro000,-4.57,0.24,1.50 +SM33,SM33_micro000,-5.64,0.24,1.50 +SM34,SM34_micro000,-5.10,0.32,1.50 +SM35,SM35_micro000,-2.44,0.49,1.50 +SM36,SM36_micro000,-4.92,0.47,1.50 +SM37,SM37_micro000,-3.78,0.42,1.50 +SM38,SM38_micro000,-2.59,0.53,1.50 +SM39,SM39_micro000,-5.06,0.54,1.50 +SM40,SM40_micro000,-4.77,0.49,1.50 +SM41,SM41_micro000,-3.37,0.38,1.50 +SM42,SM42_micro003,-5.75,0.28,1.50 +SM43,SM43_micro004,-3.91,0.24,1.50 +SM44,SM44_micro000,-3.11,0.25,1.50 +SM45,SM45_micro000,-4.91,0.28,1.50 +SM46,SM46_micro000,-4.10,0.19,1.50 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Bogdan I. Iorga/Oliver Beckstein +# +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +ICSN, CNRS, Gif-sur-Yvette, France/Arizona State University, USA +# +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +SAMPL7_logP_MDPOW_GAFF +# +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. 
+# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +20,000 hours, CPU +# +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed in parallel (8 cores for each simulation) on cluster nodes running with CentOS6 and 4 CPU Intel Xeon E5-4627 v3 @ 2.60GHz. +# +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gromacs 2020.3 +MDPOW 0.7.0-dev +AmberTools +ACPYPE 0 (2017) +# +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) +# +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. 
+# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Alchemical free energy calculations were performed in explicit +solvent, following the protocol described in [1,2]. Parameters were generated with +Antechamber from AmberTools and ACPYPE for AMBER (GAFF) with the TIP3P +water model. Files were prepared for Gromacs 2020.3. The alchemical data +were analyzed with thermodynamic integration. Errors are reported as +errors of the mean (see [1,2]). The +model uncertainty was estimated on the basis of the results from [2]. +[1] Kenney, I. M., Beckstein, O., and Iorga, B. I. (2016) Prediction of cyclohexane-water +distribution coefficients for the SAMPL5 data set using molecular dynamics simulations with +the OPLS-AA force field, J. Comput. Aided Mol. Des. 30(11):1045-1058 DOI: 10.1007/s10822-016-9949-5. +[2] Fan, S., Iorga, B. I., and Beckstein, O. (2020) Prediction of octanol-water partition coefficients for +the SAMPL6-logP molecules using molecular dynamics simulations with OPLS-AA, AMBER and CHARMM force fields, +J Comput Aided Mol Des 34(5):543-560 DOI: 10.1007/s10822-019-00267-z. +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_LigParGen.csv b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_LigParGen.csv new file mode 100644 index 00000000..fae9906a --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_LigParGen.csv @@ -0,0 +1,155 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. +# +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and its SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro004,-2.06,0.74,1.50 +SM26,SM26_micro000,-1.32,0.33,1.50 +SM27,SM27_micro000,2.08,0.36,1.50 +SM28,SM28_micro000,-1.75,0.50,1.50 +SM29,SM29_micro000,2.47,0.50,1.50 +SM30,SM30_micro000,0.78,0.51,1.50 +SM31,SM31_micro000,-5.45,0.23,1.50 +SM32,SM32_micro000,-0.62,0.38,1.50 +SM33,SM33_micro000,-2.01,0.56,1.50 +SM34,SM34_micro000,-7.91,0.24,1.50 +SM35,SM35_micro000,4.17,0.46,1.50 +SM36,SM36_micro000,1.80,0.96,1.50 +SM37,SM37_micro000,-5.76,0.33,1.50 +SM38,SM38_micro000,1.61,0.54,1.50 +SM39,SM39_micro000,-3.43,0.50,1.50 +SM40,SM40_micro000,-3.47,0.56,1.50 +SM41,SM41_micro000,-2.79,0.26,1.50 +SM42,SM42_micro003,-4.40,0.41,1.50 +SM43,SM43_micro004,-6.70,0.35,1.50 +SM44,SM44_micro000,-2.46,0.27,1.50 +SM45,SM45_micro000,-4.59,0.46,1.50 +SM46,SM46_micro000,1.16,0.50,1.50 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Bogdan I. Iorga/Oliver Beckstein +# +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +ICSN, CNRS, Gif-sur-Yvette, France/Arizona State University, USA +# +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +SAMPL7_logP_MDPOW_LigParGen +# +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +20,000 hours, CPU +# +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. 
+# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed in parallel (8 cores for each simulation) on cluster nodes running with CentOS6 and 4 CPU Intel Xeon E5-4627 v3 @ 2.60GHz. +# +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gromacs 2020.3 +MDPOW 0.7.0-dev +LigParGen 2.1 +# +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) +# +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Alchemical free energy calculations were performed in explicit +solvent, following the protocol described in [1,2]. 
Parameters were generated with the +OPLS/CM1A LigParGen server http://zarbi.chem.yale.edu/ligpargen/ for +OPLS-AA with the TIP4P water model. Files were prepared for Gromacs +2020.3. The alchemical data were analyzed with thermodynamic +integration. Errors are reported as errors of the mean (see [1,2]). The +model uncertainty was estimated on the basis of the results from [2]. +[1] Kenney, I. M., Beckstein, O., and Iorga, B. I. (2016) Prediction of cyclohexane-water +distribution coefficients for the SAMPL5 data set using molecular dynamics simulations with +the OPLS-AA force field, J. Comput. Aided Mol. Des. 30(11):1045-1058 DOI: 10.1007/s10822-016-9949-5. +[2] Fan, S., Iorga, B. I., and Beckstein, O. (2020) Prediction of octanol-water partition coefficients for +the SAMPL6-logP molecules using molecular dynamics simulations with OPLS-AA, AMBER and CHARMM force fields, +J Comput Aided Mol Des 34(5):543-560 DOI: 10.1007/s10822-019-00267-z. +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_OPLS-AA.csv b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_OPLS-AA.csv new file mode 100644 index 00000000..739d8d67 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logP_prediction_Iorga_Beckstein_OPLS-AA.csv @@ -0,0 +1,151 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. 
+# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). 
+# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25_micro004,-4.97,0.30,1.50 +SM26,SM26_micro000,-1.88,0.31,1.50 +SM27,SM27_micro000,-2.69,0.25,1.50 +SM28,SM28_micro000,-1.92,0.36,1.50 +SM29,SM29_micro000,-0.91,0.25,1.50 +SM30,SM30_micro000,-5.97,0.24,1.50 +SM31,SM31_micro000,-5.45,0.23,1.50 +SM32,SM32_micro000,-5.13,0.30,1.50 +SM33,SM33_micro000,-6.61,0.31,1.50 +SM34,SM34_micro000,-7.91,0.24,1.50 +SM35,SM35_micro000,-1.07,0.45,1.50 +SM36,SM36_micro000,-5.22,0.51,1.50 +SM37,SM37_micro000,-5.76,0.33,1.50 +SM38,SM38_micro000,3.79,0.54,1.50 +SM39,SM39_micro000,-2.17,0.45,1.50 +SM40,SM40_micro000,-3.47,0.56,1.50 +SM41,SM41_micro000,-2.93,0.22,1.50 +SM42,SM42_micro003,-6.48,0.38,1.50 +SM43,SM43_micro004,-6.70,0.35,1.50 +SM44,SM44_micro000,-3.61,0.21,1.50 +SM45,SM45_micro000,-7.61,0.19,1.50 +SM46,SM46_micro000,-6.94,0.24,1.50 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Bogdan I. Iorga/Oliver Beckstein +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +ICSN, CNRS, Gif-sur-Yvette, France/Arizona State University, USA +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +SAMPL7_logP_MDPOW_OPLS-AA +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. 
+# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +20,000 hours, CPU +# +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed in parallel (8 cores for each simulation) on cluster nodes running with CentOS6 and 4 CPU Intel Xeon E5-4627 v3 @ 2.60GHz. +# +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gromacs 2020.3 +MDPOW 0.7.0-dev +mol2ff +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (MM) +# +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. 
+# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Alchemical free energy calculations were performed in explicit +solvent, following the protocol described in [1,2]. Parameters were generated with the +OPLSAA mol2ff software (B.I. Iorga, unpublished; also used in [1,2]) for +OPLS-AA with the TIP4P water model. Files were prepared for Gromacs +2020.3. The alchemical data were analyzed with thermodynamic +integration. Errors are reported as errors of the mean (see [1,2]). The +model uncertainty was estimated on the basis of the results from [2]. +[1] Kenney, I. M., Beckstein, O., and Iorga, B. I. (2016) Prediction of cyclohexane-water +distribution coefficients for the SAMPL5 data set using molecular dynamics simulations with +the OPLS-AA force field, J. Comput. Aided Mol. Des. 30(11):1045-1058 DOI: 10.1007/s10822-016-9949-5. +[2] Fan, S., Iorga, B. I., and Beckstein, O. (2020) Prediction of octanol-water partition coefficients for +the SAMPL6-logP molecules using molecular dynamics simulations with OPLS-AA, AMBER and CHARMM force fields, +J Comput Aided Mol Des 34(5):543-560 DOI: 10.1007/s10822-019-00267-z. +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logp-nhlbi-1.csv b/physical_property/logP/Analysis/Submissions/logp-nhlbi-1.csv new file mode 100644 index 00000000..db11cce7 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp-nhlbi-1.csv @@ -0,0 +1,96 @@ +# WATER-OCTANOL TRANSFER FREE ENERGY PREDICTIONS +# The energy units must be in kcal/mol. 
+ + +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +# ID tag, Molecule ID, TFE, TFE SEM, TFE model uncertainty +Predictions: +SM25,SM25_micro000,2.10,0.00,0.00 +SM26,SM26_micro000,0.15,0.00,0.00 +SM27,SM27_micro000,0.64,0.00,0.00 +SM28,SM28_micro000,-0.31,0.00,0.00 +SM29,SM29_micro000,0.24,0.00,0.00 +SM30,SM30_micro000,2.36,0.00,0.00 +SM31,SM31_micro000,1.28,0.00,0.00 +SM32,SM32_micro000,2.04,0.00,0.00 +SM33,SM33_micro000,4.26,0.00,0.00 +SM34,SM34_micro000,2.98,0.00,0.00 +SM35,SM35_micro000,-0.94,0.00,0.00 +SM36,SM36_micro000,0.79,0.00,0.00 +SM37,SM37_micro000,1.07,0.00,0.00 +SM38,SM38_micro000,-2.18,0.00,0.00 +SM39,SM39_micro000,0.64,0.00,0.00 +SM40,SM40_micro000,-0.97,0.00,0.00 +SM41,SM41_micro000,-0.83,0.00,0.00 +SM42,SM42_micro000,1.45,0.00,0.00 +SM43,SM43_micro000,0.12,0.00,0.00 +SM44,SM44_micro000,-2.09,0.00,0.00 +SM45,SM45_micro000,-0.35,0.00,0.00 +SM46,SM46_micro000,-1.06,0.00,0.00 + + +Participant name: +Michael Jones + +# +Participant organization: +NIH NHLBI + +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-NHLBI-TZVP-QM + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gaussian 09/16 +OpenBabel +MOE 2018 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + + +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. 
+# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +30 hours, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# +Computing and hardware: +All calculations were performed on the Biowulf and Lobos clusters at the National Institutes of Health. + + + + +# METHOD DESCRIPTION SECTION +# +Method: +Def2-TZVP basis sets were used for all calculations. All calculations were performed in either Gaussian 09 or Gaussian 16. All challenge molecules were converted to 3D coordinates using OpenBabel from the SMILES string. Structures were then optimized with the B3LYP density functional and were verified to be local minima via frequency calculations on an ‘ultrafine’ integration grid with harmonic frequencies. The transfer free energies were calculated as (\Delta G_water - \Delta G_octanol) and are reported in kcal/mol. + + +# +Ranked: +True + diff --git a/physical_property/logP/Analysis/Submissions/logp-nhlbi-2.csv b/physical_property/logP/Analysis/Submissions/logp-nhlbi-2.csv new file mode 100644 index 00000000..1e9cda02 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp-nhlbi-2.csv @@ -0,0 +1,97 @@ +# WATER-OCTANOL TRANSFER FREE ENERGY PREDICTIONS +# The energy units must be in kcal/mol. + + +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here.
+Predictions: +SM25,SM25_micro000,2.46,0.78,0.98 +SM26,SM26_micro000,2.56,0.78,0.98 +SM27,SM27_micro000,3.24,0.78,0.98 +SM28,SM28_micro000,3.50,0.78,0.98 +SM29,SM29_micro000,1.54,0.78,0.98 +SM30,SM30_micro000,3.39,0.78,0.98 +SM31,SM31_micro000,3.66,0.78,0.98 +SM32,SM32_micro000,1.85,0.78,0.98 +SM33,SM33_micro000,3.26,0.78,0.98 +SM34,SM34_micro000,3.66,0.78,0.98 +SM35,SM35_micro000,2.79,0.78,0.98 +SM36,SM36_micro000,3.77,0.78,0.98 +SM37,SM37_micro000,4.41,0.78,0.98 +SM38,SM38_micro000,2.68,0.78,0.98 +SM39,SM39_micro000,3.33,0.78,0.98 +SM40,SM40_micro000,3.78,0.78,0.98 +SM41,SM41_micro000,2.82,0.78,0.98 +SM42,SM42_micro000,3.87,0.78,0.98 +SM43,SM43_micro000,3.61,0.78,0.98 +SM44,SM44_micro000,0.97,0.78,0.98 +SM45,SM45_micro000,1.89,0.78,0.98 +SM46,SM46_micro000,1.43,0.78,0.98 + +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Samar Prasad +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +NIH NHLBI + +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +TFE-NHLBI-NN-IN + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Tensorflow 1.10 +RDKIT Release 2018 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical`, `Empirical`, `Mixed`, or `Other`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + + +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. 
+# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.2 hours, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All calculations were performed on a Dell Desktop (16GB RAM, Quad core i7 processor). + + + + +# METHOD DESCRIPTION SECTION +# +Method: +We built a deep neural network model to predict log P values. The input vector consists of a 1024-bit vector of Morgan Fingerprints. The dense NN contains 5 hidden layers with 512, 512, 256, 128 and 64 units. Input training data was obtained from Popova et al. (Science Advances 25 Jul 2018:Vol. 4, no. 7, eaap7885 DOI: 10.1126/sciadv.aap7885). 5-fold cross validation was performed on the training data. This same model was employed in the SAMPL6 competition. Following the calculation of logP, the values were converted to kcal/mol. + +# +Ranked: +False + diff --git a/physical_property/logP/Analysis/Submissions/logp_DB1.csv b/physical_property/logP/Analysis/Submissions/logp_DB1.csv new file mode 100644 index 00000000..5cb93e65 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_DB1.csv @@ -0,0 +1,214 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing.
+# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. 
+# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25_micro000,-3.04,0.0,0.19 +SM26,SM26_micro000,-1.31,0.0,0.15 +SM27,SM27_micro000,-1.82,0.0,0.46 +SM28,SM28_micro000,-1.93,0.0,0.33 +SM29,SM29_micro000,-1.91,0.0,0.43 +SM30,SM30_micro000,-3.18,0.0,0.65 +SM31,SM31_micro000,-2.00,0.0,0.45 +SM32,SM32_micro000,-3.01,0.0,0.73 +SM33,SM33_micro000,-4.51,0.0,0.56 +SM34,SM34_micro000,-2.95,0.0,0.81 +SM35,SM35_micro000,-0.61,0.0,0.37 +SM36,SM36_micro000,-2.30,0.0,0.62 +SM37,SM37_micro000,-0.41,0.0,0.55 +SM38,SM38_micro000,-1.94,0.0,0.33 +SM39,SM39_micro000,-2.60,0.0,0.19 +SM40,SM40_micro000,-1.82,0.0,0.17 +SM41,SM41_micro000,-2.19,0.0,0.38 +SM42,SM42_micro000,-4.81,0.0,0.67 +SM43,SM43_micro000,-2.52,0.0,0.38 +SM44,SM44_micro000,-0.51,0.0,0.27 +SM45,SM45_micro000,-2.09,0.0,0.27 +SM46,SM46_micro000,-0.54,0.0,0.30 + + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Alex Dickson + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Michigan State University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +ClassicalGSG + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. 
+Compute time: +52 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +We used the Michigan State University's High Performance Computing +Center (HPCC) resources for all computations. To generate molecular +features and train the models, we performed parallel tasks on a single +node, where 12 CPUs and 20 CPUs of model Intel(R) Xeon(R) CPU +E5-2680 v4 @ 2.40GHz CPUs were employed, respectively. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +CGenFF 2.3.0 +Openbabel 2.4.1 +RDKit 2019.09.1.0 +PyTorch 1.3.0 +Skorch 0.7.0 +Sklearn 0.21.3 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. 
+# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: + +The logP values here were predicted using an empirical method we refer +to as “ClassicalGSG”. It employs neural networks (NNs) where the +inputs are molecular features generated using a recently developed +method called Geometric Scattering for Graphs (GSG). In GSG, atomic +features are transformed into molecular features using the graph +molecular structure. For atomic features, all of our predictions use +four physical quantities from classical molecular dynamics +forcefields: partial charge, Lennard-Jones well depth, Lennard-Jones +radius and atomic type. The first three parameters are calculated +using the CGenFF package. The atomic type is the one-hot encoding of +an integer from 0 to 35 determined by a manual grouping of CGenFF atom +types. This implementation of the GSG method uses a concatenation of +the zeroth, first and second order features, calculated using a +wavelet number (J) of 4. In the neural networks, we performed a grid +search and five-fold cross validation to tune the following +hyperparameters: the number of hidden layers is tested for [2,3,4,5], +the size of hidden layer for [100, 200, 300, 400], and dropout for +[0.0, 0.2, 0.4]. The loss function of the NNs is MSELOSS and the Adam +optimizer is used for optimizing the parameters. + +The full training set was built using the following datasets: +ALOGPS_3_01, Huuskonen, logpt_all_data_training, +Logpt_challenge_training, PhysProp, Guowei, Chembl21. The RDKit +package is employed to create canonical SMILES for each +molecule. After removing duplicate molecules, 44,595 molecules +remained in the dataset. We used Openbabel to convert SMILES to mol2 +files. These mol2 files are fed to CGenFF to determine partial charges +and Lennard-Jones parameters for all atoms in each molecule.
As the +generation of CGenFF atomic attributes failed for some molecules, we +ended up with 41,409 molecules in our dataset, which we refer to below +as the “full dataset”. + + +DB1: The full training set was used for these predictions. + +DB2: In this prediction we use a training set of 3,482 molecules, +obtained by filtering the full training set and keeping only those +that had the following elements (C, N, O, S and H). Molecules that +either had another element, not listed above, or did not have the full +set of elements, were not selected. + +DB3: In this prediction we use a training set of 2,379 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. This was done using the +HasSubstructMatch function of the RDKit toolkit. + +DB4: In this prediction we use a training set of 1,482 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. Additionally, these were further +filtered to select only compounds with the following elements (C, N, +O, S, and H). Molecules that either had another element, not listed +above, or did not have the full set of elements, were not selected. + +Here the logP values are predicted by the model trained on DB1. The +model uncertainties are the standard deviations of the logP values +predicted by the four different models trained on DB1, DB2, DB3, and +DB4. + + + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logp_DB2.csv b/physical_property/logP/Analysis/Submissions/logp_DB2.csv new file mode 100644 index 00000000..fe059795 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_DB2.csv @@ -0,0 +1,211 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-3.33,0.0,0.19 +SM26,SM26_micro000,-1.32,0.0,0.15 +SM27,SM27_micro000,-1.59,0.0,0.46 +SM28,SM28_micro000,-1.50,0.0,0.33 +SM29,SM29_micro000,-1.67,0.0,0.43 +SM30,SM30_micro000,-3.01,0.0,0.65 +SM31,SM31_micro000,-2.31,0.0,0.45 +SM32,SM32_micro000,-3.55,0.0,0.73 +SM33,SM33_micro000,-4.41,0.0,0.56 +SM34,SM34_micro000,-3.96,0.0,0.81 +SM35,SM35_micro000,-1.09,0.0,0.37 +SM36,SM36_micro000,-2.61,0.0,0.62 +SM37,SM37_micro000,-1.58,0.0,0.55 +SM38,SM38_micro000,-1.08,0.0,0.33 +SM39,SM39_micro000,-2.17,0.0,0.19 +SM40,SM40_micro000,-1.42,0.0,0.17 +SM41,SM41_micro000,-2.47,0.0,0.38 +SM42,SM42_micro000,-3.48,0.0,0.67 +SM43,SM43_micro000,-2.02,0.0,0.38 +SM44,SM44_micro000,-1.12,0.0,0.27 +SM45,SM45_micro000,-2.74,0.0,0.27 +SM46,SM46_micro000,-1.31,0.0,0.30 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Alex Dickson + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Michigan State University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +ClassicalGSG + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +8 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +We used the Michigan State University's High Performance Computing +Center (HPCC) resources for all computations. To generate molecular +features and train the models, we performed parallel tasks on a single +node, where 12 CPUs and 20 CPUs of model Intel(R) Xeon(R) CPU +E5-2680 v4 @ 2.40GHz CPUs were employed, respectively. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +CGenFF 2.3.0 +Openbabel 2.4.1 +RDKit 2019.09.1.0 +PyTorch 1.3.0 +Skorch 0.7.0 +Sklearn 0.21.3 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: + +The logP values here were predicted using an empirical method we refer +to as “ClassicalGSG”. 
It employs neural networks (NNs) where the +inputs are molecular features generated using a recently developed +method called Geometric Scattering for Graphs (GSG). In GSG, atomic +features are transformed into molecular features using the graph +molecular structure. For atomic features, all of our predictions use +four physical quantities from classical molecular dynamics +forcefields: partial charge, Lennard-Jones well depth, Lennard-Jones +radius and atomic type. The first three parameters are calculated +using the CGenFF package. The atomic type is the one-hot encoding of +an integer from 0 to 35 determined by a manual grouping of CGenFF atom +types. This implementation of the GSG method uses a concatenation of +the zeroth, first and second order features, calculated using a +wavelet number (J) of 4. In the neural networks, we performed a grid +search and five-fold cross validation to tune the following +hyperparameters: the number of hidden layers is tested for [2,3,4,5], +the size of hidden layer for [100, 200, 300, 400], and dropout for +[0.0, 0.2, 0.4]. The loss function of the NNs is MSELOSS and the Adam +optimizer is used for optimizing the parameters. + +The full training set was built using the following datasets: +ALOGPS_3_01, Huuskonen, logpt_all_data_training, +Logpt_challenge_training, PhysProp, Guowei, Chembl21. The RDKit +package is employed to create canonical SMILES for each +molecule. After removing duplicate molecules, 44,595 molecules +remained in the dataset. We used Openbabel to convert SMILES to mol2 +files. These mol2 files are fed to CGenFF to determine partial charges +and Lennard-Jones parameters for all atoms in each molecule. As the +generation of CGenFF atomic attributes failed for some molecules, we +ended up with 41,409 molecules in our dataset, which we refer to below +as the “full dataset”. + +DB1: The full training set was used for these predictions.
+ +DB2: In this prediction we use a training set of 3,482 molecules, +obtained by filtering the full training set and keeping only those +that had the following elements (C, N, O, S and H). Molecules that +either had another element, not listed above, or did not have the full +set of elements, were not selected. + +DB3: In this prediction we use a training set of 2,379 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. This was done using the +HasSubstructMatch function of the RDKit toolkit. + +DB4: In this prediction we use a training set of 1,482 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. Additionally, these were further +filtered to select only compounds with the following elements (C, N, +O, S, and H). Molecules that either had another element, not listed +above, or did not have the full set of elements, were not selected. + +Here the logP values are predicted by the model trained on DB4. The +model uncertainties are the standard deviations of the logP values +predicted by the four different models trained on DB1, DB2, DB3, and +DB4. + + + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logp_DB3.csv b/physical_property/logP/Analysis/Submissions/logp_DB3.csv new file mode 100644 index 00000000..a0ea283d --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_DB3.csv @@ -0,0 +1,214 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed.
It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). +# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. 
+# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25_micro000,-3.59,0.0,0.19 +SM26,SM26_micro000,-0.96,0.0,0.15 +SM27,SM27_micro000,-1.46,0.0,0.46 +SM28,SM28_micro000,-1.01,0.0,0.33 +SM29,SM29_micro000,-1.63,0.0,0.43 +SM30,SM30_micro000,-3.77,0.0,0.65 +SM31,SM31_micro000,-2.85,0.0,0.45 +SM32,SM32_micro000,-4.95,0.0,0.73 +SM33,SM33_micro000,-5.81,0.0,0.56 +SM34,SM34_micro000,-5.23,0.0,0.81 +SM35,SM35_micro000,-1.41,0.0,0.37 +SM36,SM36_micro000,-3.17,0.0,0.62 +SM37,SM37_micro000,-1.88,0.0,0.55 +SM38,SM38_micro000,-1.22,0.0,0.33 +SM39,SM39_micro000,-2.17,0.0,0.19 +SM40,SM40_micro000,-1.59,0.0,0.17 +SM41,SM41_micro000,-1.44,0.0,0.38 +SM42,SM42_micro000,-4.10,0.0,0.67 +SM43,SM43_micro000,-2.99,0.0,0.38 +SM44,SM44_micro000,-0.48,0.0,0.27 +SM45,SM45_micro000,-2.73,0.0,0.27 +SM46,SM46_micro000,-1.12,0.0,0.30 + + + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Alex Dickson + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Michigan State University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +ClassicalGSG + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. 
+# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +10 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +We used the Michigan State University's High Performance Computing +Center (HPCC) resources for all computations. To generate molecular +features and train the models, we performed parallel tasks on a single +node, where 12 CPUs and 20 CPUs of model Intel(R) Xeon(R) CPU +E5-2680 v4 @ 2.40GHz CPUs were employed, respectively. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +CGenFF 2.3.0 +Openbabel 2.4.1 +RDKit 2019.09.1.0 +PyTorch 1.3.0 +Skorch 0.7.0 +Sklearn 0.21.3 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. 
+# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: + +The logP values here were predicted using an empirical method we refer +to as “ClassicalGSG”. It employs neural networks (NNs) where the +inputs are molecular features generated using a recently developed +method called Geometric Scattering for Graphs (GSG). In GSG, atomic +features are transformed into molecular features using the graph +molecular structure. For atomic features, all of our predictions use +four physical quantities from classical molecular dynamics +forcefields: partial charge, Lennard-Jones well depth, Lennard-Jones +radius and atomic type. The first three parameters are calculated +using the CGenFF package. The atomic type is the one-hot encoding of +an integer from 0 to 35 determined by a manual grouping of CGenFF atom +types. This implementation of the GSG method uses a concatenation of +the zeroth, first and second order features, calculated using a +wavelet number (J) of 4. In the neural networks, we performed a grid +search and five-fold cross validation to tune the following +hyperparameters: the number of hidden layers is tested for [2,3,4,5], +the size of hidden layer for [100, 200, 300, 400], and dropout for +[0.0, 0.2, 0.4]. The loss function of the NNs is MSELoss and the Adam +optimizer is used for optimizing the parameters. + + +The full training set was built using the following datasets: +ALOGPS_3_01, Huuskonen, logpt_all_data_training, +Logpt_challenge_training, PhysProp, Guowei, Chembl21. The RDKit +package is employed to create canonical SMILES for each +molecule. After removing duplicate molecules, 44,595 molecules +remained in the dataset.
We used Openbabel to convert SMILES to mol2 +files. These mol2 files are fed to CGenFF to determine partial charges +and Lennard-Jones parameters for all atoms in each molecule. As the +generation of CGenFF atomic attributes failed for some molecules, we +ended up with 41,409 molecules in our dataset, which we refer to below +as the “full dataset”. + +DB1: The full training set was used for these predictions. + +DB2: In this prediction we use a training set of 3,482 molecules, +obtained by filtering the full training set and keeping only those +that had the following elements (C, N, O, S and H). Molecules that +either had another element, not listed above, or did not have the full +set of elements, were not selected. + +DB3: In this prediction we use a training set of 2,379 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. This was done using the +HasSubstructMatch function of the RDKit toolkit. + +DB4: In this prediction we use a training set of 1,482 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. Additionally, these were further +filtered to select only compounds with the following elements (C, N, +O, S, and H). Molecules that either had another element, not listed +above, or did not have the full set of elements, were not selected. + +Here the logP values are predicted by the model trained on DB3. The +model uncertainties are the standard deviation of the logP values +predicted by the four different models trained on DB1, DB2, DB3, and +DB4. + + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logp_DB4.csv b/physical_property/logP/Analysis/Submissions/logp_DB4.csv new file mode 100644 index 00000000..50e76921 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_DB4.csv @@ -0,0 +1,212 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,-3.28,0.0,0.19 +SM26,SM26_micro000,-1.09,0.0,0.15 +SM27,SM27_micro000,-0.60,0.0,0.46 +SM28,SM28_micro000,-1.48,0.0,0.33 +SM29,SM29_micro000,-0.77,0.0,0.43 +SM30,SM30_micro000,-1.98,0.0,0.65 +SM31,SM31_micro000,-1.62,0.0,0.45 +SM32,SM32_micro000,-3.46,0.0,0.73 +SM33,SM33_micro000,-4.74,0.0,0.56 +SM34,SM34_micro000,-4.26,0.0,0.81 +SM35,SM35_micro000,-0.49,0.0,0.37 +SM36,SM36_micro000,-1.45,0.0,0.62 +SM37,SM37_micro000,-1.16,0.0,0.55 +SM38,SM38_micro000,-1.39,0.0,0.33 +SM39,SM39_micro000,-2.14,0.0,0.19 +SM40,SM40_micro000,-1.83,0.0,0.17 +SM41,SM41_micro000,-2.02,0.0,0.38 +SM42,SM42_micro000,-3.02,0.0,0.67 +SM43,SM43_micro000,-2.13,0.0,0.38 +SM44,SM44_micro000,-0.90,0.0,0.27 +SM45,SM45_micro000,-2.53,0.0,0.27 +SM46,SM46_micro000,-1.21,0.0,0.30 + + + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Alex Dickson + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Michigan State University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +ClassicalGSG + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +240 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: + +We used the Michigan State University's High Performance Computing +Center (HPCC) resources for all computations. To generate molecular +features and train the models, we performed parallel tasks on a single +node, where 12 CPUs and 20 CPUs of model Intel(R) Xeon(R) CPU +E5-2680 v4 @ 2.40GHz CPUs were employed, respectively. + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +CGenFF 2.3.0 +Openbabel 2.4.1 +RDKit 2019.09.1.0 +PyTorch 1.3.0 +Skorch 0.7.0 +Sklearn 0.21.3 + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: + +The logP values here were predicted using an empirical method we refer +to as “ClassicalGSG”. 
It employs neural networks (NNs) where the +inputs are molecular features generated using a recently developed +method called Geometric Scattering for Graphs (GSG). In GSG, atomic +features are transformed into molecular features using the graph +molecular structure. For atomic features, all of our predictions use +four physical quantities from classical molecular dynamics +forcefields: partial charge, Lennard-Jones well depth, Lennard-Jones +radius and atomic type. The first three parameters are calculated +using the CGenFF package. The atomic type is the one-hot encoding of +an integer from 0 to 35 determined by a manual grouping of CGenFF atom +types. This implementation of the GSG method uses a concatenation of +the zeroth, first and second order features, calculated using a +wavelet number (J) of 4. In the neural networks, we performed a grid +search and five-fold cross validation to tune the following +hyperparameters: the number of hidden layers is tested for [2,3,4,5], +the size of hidden layer for [100, 200, 300, 400], dropout for [0.0, +0.2, 0.4] and the learning rate for 100 uniform numbers from -5 to +-2.5. The loss function of the NNs is MSELoss and the Adam optimizer +is used for optimizing the parameters. + +The full training set was built using the following datasets: +ALOGPS_3_01, Huuskonen, logpt_all_data_training, +Logpt_challenge_training, PhysProp, Guowei, Chembl21. The RDKit +package is employed to create canonical SMILES for each +molecule. After removing duplicate molecules, 44,595 molecules +remained in the dataset. We used Openbabel to convert SMILES to mol2 +files. These mol2 files are fed to CGenFF to determine partial charges +and Lennard-Jones parameters for all atoms in each molecule. As the +generation of CGenFF atomic attributes failed for some molecules, we +ended up with 41,409 molecules in our dataset, which we refer to below +as the “full dataset”. + +DB1: The full training set was used for these predictions.
+ +DB2: In this prediction we use a training set of 3,482 molecules, +obtained by filtering the full training set and keeping only those +that had the following elements (C, N, O, S and H). Molecules that +either had another element, not listed above, or did not have the full +set of elements, were not selected. + +DB3: In this prediction we use a training set of 2,379 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. This was done using the +HasSubstructMatch function of the RDKit toolkit. + +DB4: In this prediction we use a training set of 1,482 molecules, +obtained by filtering the full training set and keeping only those +with sulfonyl functional groups. Additionally, these were further +filtered to select only compounds with the following elements (C, N, +O, S, and H). Molecules that either had another element, not listed +above, or did not have the full set of elements, were not selected. + +Here the logP values were predicted by the model trained on DB4. The +model uncertainties are the standard deviation of the logP values +predicted by the four different models trained on DB1, DB2, DB3, and +DB4. +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model1.csv b/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model1.csv new file mode 100644 index 00000000..92bedca5 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model1.csv @@ -0,0 +1,163 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,2.47,0.38,0.44 +SM26,SM26_micro000,1.43,0.32,0.44 +SM27,SM27_micro000,1.66,0.36,0.44 +SM28,SM28_micro000,1.84,0.23,0.44 +SM29,SM29_micro000,1.68,0.44,0.44 +SM30,SM30_micro000,2.44,0.51,0.44 +SM31,SM31_micro000,1.95,0.36,0.44 +SM32,SM32_micro000,2.09,0.42,0.44 +SM33,SM33_micro000,2.96,0.44,0.44 +SM34,SM34_micro000,2.2,0.54,0.44 +SM35,SM35_micro000,1.8,0.42,0.44 +SM36,SM36_micro000,2.52,0.43,0.44 +SM37,SM37_micro000,1.92,0.46,0.44 +SM38,SM38_micro000,1.81,0.31,0.44 +SM39,SM39_micro000,2.51,0.4,0.44 +SM40,SM40_micro000,1.91,0.4,0.44 +SM41,SM41_micro000,1.49,0.28,0.44 +SM42,SM42_micro000,2.22,0.43,0.44 +SM43,SM43_micro000,1.81,0.42,0.44 +SM44,SM44_micro000,1.35,0.36,0.44 +SM45,SM45_micro000,2.01,0.32,0.44 +SM46,SM46_micro000,1.57,0.21,0.44 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Jae Hong Shin + + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +NetTargets Inc. + + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +Ensemble prediction of TFE + + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +24 hours, CPU +10 hours, GPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The model trains took about 24 hours including features selection for random forest, XGBoost, and 1D-CNN methods respectively. +prediction times took less than a few seconds since we applied empirical methods. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +R for Random forest method +Tensorflow for 1D-CNN + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +The present study attempts to predict the logp for 22 compounds as part of SAMPL7 challenge. +Our workflow consists of four steps as follows: 1)data collection and molecular descriptor calculation, 2)feature extraction, 3)prediction model build, and 4)ensemble prediction. 
To build prediction models, we applied machine learning approaches such as random forest, XGBoost, and 1D-CNN methods. We also applied ensemble predictions by combining three prediction models. In order to build prediction models, we calculated various molecular descriptors including physicochemical, topological descriptors, and molecular fingerprints. +Data collection and molecular descriptor calculation +We collected 2 different datasets which are Martel set (n=707) and PHYSPROP (n=11657) and calculated more than 4000 molecular descriptors that may represent logp values. +We applied a meta learning model which utilized already existing logp prediction values, such as Xlogp, Alogp, etc. +Feature selection +Since we calculated more than 4000 molecular descriptors, it is better to select significant descriptors for appropriately representing logp as well as avoiding model overfitting. We recursively eliminate less significant features with the generation of multiple random forest models in each iteration. We obtained 101 features for Martel dataset and 190 features for PHYSPROP dataset to build predictive models. +Model Construction +We applied an ensemble prediction employing three different prediction models, established by random forest, XGBoost, and 1D-CNN respectively. We also applied 5 fold cross validation and each machine learning model predicts the logp values by averaging the five models from each fold. +This model, called "model1", was built with the Martel dataset. +Unit conversion +As our model calculates logp, we convert it to kcal/mol units by using the equation: logP_ow = ((deltaG_w - deltaG_o)/(RT)) * log10(e) + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model2.csv b/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model2.csv new file mode 100644 index 00000000..73d15cf7 --- /dev/null +++ b/physical_property/logP/Analysis/Submissions/logp_ensemble_logp_model2.csv @@ -0,0 +1,164 @@ +# OCTANOL TO WATER (ΔG_octanol - ΔG_water) TRANSFER FREE ENERGY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# +# PREDICTION SECTION +# +# It is mandatory to submit water to octanol (ΔG_octanol - ΔG_water) transfer free energy (TFE) predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# The energy units must be in kcal/mol. + +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section (e.g. `SM25_micro000`, SM25_extra001`) +# Please report TFE standard error of the mean (SEM) and TFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), TFE, TFE SEM, TFE model uncertainty, (optional) logD, (optional) SEM logD +# +# Your transfer free energy prediction for the neutral form does NOT have to be `SMXX_micro000` (which is the challenge provided neutral microstate). 
+# If you use a microstate other than the challenge provided microstate, please fill out the `Molecule ID/IDs considered (no commas)` section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods description in the `METHOD DESCRIPTION SECTION`. +# +# You may optionally provide predicted logD values; these will be used as a consistency check on our estimated logD values if you submit both logP and pKa values. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25_micro000,1.99,0.08,0.35 +SM26,SM26_micro000,0.72,0.1,0.35 +SM27,SM27_micro000,1,0.15,0.35 +SM28,SM28_micro000,1.11,0.24,0.35 +SM29,SM29_micro000,0.82,0.03,0.35 +SM30,SM30_micro000,1.7,0.12,0.35 +SM31,SM31_micro000,0.75,0.09,0.35 +SM32,SM32_micro000,1.42,0.3,0.35 +SM33,SM33_micro000,2.26,0.35,0.35 +SM34,SM34_micro000,1.45,0.22,0.35 +SM35,SM35_micro000,0.57,0.27,0.35 +SM36,SM36_micro000,1.59,0.32,0.35 +SM37,SM37_micro000,0.65,0.07,0.35 +SM38,SM38_micro000,0.76,0.17,0.35 +SM39,SM39_micro000,1.64,0.21,0.35 +SM40,SM40_micro000,0.73,0.15,0.35 +SM41,SM41_micro000,1.07,0.08,0.35 +SM42,SM42_micro000,1.91,0.24,0.35 +SM43,SM43_micro000,1.07,0.04,0.35 +SM44,SM44_micro000,0.34,0.07,0.35 +SM45,SM45_micro000,1.15,0.02,0.35 +SM46,SM46_micro000,0.46,0.1,0.35 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Jae Hong Shin + + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +NetTargets Inc. + + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +Ensemble prediction of TFE + + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +24 hours, CPU +10 hours, GPU + + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The model trains took about 24 hours including features selection for random forest, XGBoost, and 1D-CNN methods respectively. +prediction times took less than a few seconds since we applied empirical methods. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +R for Random forest method +Tensorflow for 1D-CNN + + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +The present study attempts to predict the logp for 22 compounds as part of SAMPL7 challenge. +Our workflow consists of four steps as follows: 1)data collection and molecular descriptor calculation, 2)feature extraction, 3)prediction model build, and 4)ensemble prediction. 
To build prediction models, we applied machine learning approaches such as random forest, XGBoost, and 1D-CNN methods. We also applied ensemble predictions by combining three prediction models. In order to build prediction models, we calculated various molecular descriptors including physicochemical, topological descriptors, and +molecular fingerprints. +Data collection and molecular descriptor calculation +We collected 2 different datasets which are Martel set (n=707) and PHYSPROP (n=11657) and calculated more than 4000 molecular descriptors that may represent logp values. +We applied a meta learning model which utilized already existing logp prediction values, such as Xlogp, Alogp, etc. +Feature selection +Since we calculated more than 4000 molecular descriptors, it is better to select significant descriptors for appropriately representing logp as well as avoiding model overfitting. We recursively eliminate less significant features with the generation of multiple random forest models in each iteration. We obtained 101 features for Martel dataset and 190 features for PHYSPROP dataset to build predictive models. +Model Construction +We applied an ensemble prediction employing three different prediction models, established by random forest, XGBoost, and 1D-CNN respectively. We also applied 5 fold cross validation and each machine learning model predicts the logp values by averaging the five models from each fold. +This model, called "model2", was built with EPI Suite's PHYSPROP dataset. +Unit conversion +As our model calculates logp, we convert it to kcal/mol units by using the equation: logP_ow = ((deltaG_w - deltaG_o)/(RT)) * log10(e) + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +False diff --git a/physical_property/pKa/Analysis/SAMPL7-user-map-HG.csv b/physical_property/pKa/Analysis/SAMPL7-user-map-HG.csv new file mode 100644 index 00000000..1f8aa47f --- /dev/null +++ b/physical_property/pKa/Analysis/SAMPL7-user-map-HG.csv @@ -0,0 +1,10 @@ +11,pKa-VA-2.csv +12,pKa-ECRISM-1.csv +13,pKa-IEFPCMMST-1.csv +14,pKa-RobertRaddi.csv +15,pKa_prediction_Iorga_Beckstein_1.csv +16,pka-nhlbi-1.csv +17,pKa_RodriguezPaluch_SMD_1.csv +18,pKa_RodriguezPaluch_SMD_2.csv +19,pKa_RodriguezPaluch_SMD_3.csv +20,pka-nhlbi-1_L0OUNi2.csv diff --git a/physical_property/pKa/Analysis/Scripts/get_usermap.py b/physical_property/pKa/Analysis/Scripts/get_usermap.py new file mode 100644 index 00000000..f965035c --- /dev/null +++ b/physical_property/pKa/Analysis/Scripts/get_usermap.py @@ -0,0 +1,16 @@ +#!/bin/env python + +outfile = '../SAMPL7-user-map-HG.csv' + +# Read user map from submission server +file = open('/Users/dmobley/github/SAMPL-submission-systems/SAMPL-submission-handling-shared/submissions/downloads/submission_table.txt', 'r') +text = file.readlines() +file.close() + +# Write output file, removing e-mail addresses +file = open(outfile, 'w') +for line in text: + tmp = line.split(',') + if 'PKA' in tmp[2].upper(): + file.write(f'{tmp[0].strip()},{tmp[2].strip().replace(" ","_")}\n') +file.close() diff --git a/physical_property/pKa/Analysis/Submissions/pKa-ECRISM-1.csv b/physical_property/pKa/Analysis/Submissions/pKa-ECRISM-1.csv new file mode 100644 index 00000000..91af07cb --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa-ECRISM-1.csv @@ -0,0 +1,141 @@ +Predictions: +SM25_micro000,SM25_micro003,-1,7.91,0.01,1.04 +SM25_micro000,SM25_micro001,-1,-6.66,0.01,1.04 +SM25_micro000,SM25_micro002,0,-7.52,0.01,1.04 +SM25_micro000,SM25_micro004,0,-12.08,0.01,1.04 +SM25_micro000,SM25_micro005,1,-2.33,0.01,1.04 +SM26_micro000,SM26_micro001,-1,5.53,0.01,1.04 
+SM26_micro000,SM26_micro003,-1,19.92,0.01,1.04 +SM26_micro000,SM26_micro002,0,12.59,0.01,1.04 +SM26_micro000,SM26_micro004,0,4.82,0.01,1.04 +SM26_micro000,SM26_micro005,1,9.95,0.01,1.04 +SM27_micro000,SM27_micro001,-1,10.17,0.01,1.04 +SM28_micro000,SM28_micro002,-1,13.95,0.01,1.04 +SM28_micro000,SM28_micro001,0,14.75,0.01,1.04 +SM28_micro000,SM28_micro004,-1,31.52,0.01,1.04 +SM28_micro000,SM28_micro003,1,6.75,0.01,1.04 +SM29_micro000,SM29_micro001,-1,9.88,0.01,1.04 +SM30_micro000,SM30_micro001,-1,9.4,0.01,1.04 +SM31_micro000,SM31_micro001,-1,11.15,0.01,1.04 +SM31_micro000,SM31_micro002,1,1.88,0.01,1.04 +SM32_micro000,SM32_micro001,-1,10.25,0.01,1.04 +SM33_micro000,SM33_micro001,-1,9.8,0.01,1.04 +SM34_micro000,SM34_micro001,-1,10.4,0.01,1.04 +SM34_micro000,SM34_micro002,1,1.08,0.01,1.04 +SM35_micro000,SM35_micro001,-1,9.68,0.01,1.04 +SM35_micro000,SM35_micro003,-1,9.68,0.01,1.04 +SM35_micro000,SM35_micro002,0,0.21,0.01,1.04 +SM36_micro000,SM36_micro001,-1,9.4,0.01,1.04 +SM36_micro000,SM36_micro003,-1,9.4,0.01,1.04 +SM36_micro000,SM36_micro002,0,-0.01,0.01,1.04 +SM37_micro000,SM37_micro002,-1,10.03,0.01,1.04 +SM37_micro000,SM37_micro004,-1,10.03,0.01,1.04 +SM37_micro000,SM37_micro003,0,0.19,0.01,1.04 +SM37_micro000,SM37_micro001,1,3.59,0.01,1.04 +SM37_micro000,SM37_micro005,1,0.58,0.01,1.04 +SM38_micro000,SM38_micro001,-1,9.31,0.01,1.04 +SM39_micro000,SM39_micro001,-1,8.45,0.01,1.04 +SM40_micro000,SM40_micro001,-1,9.4,0.01,1.04 +SM40_micro000,SM40_micro002,1,2.7,0.01,1.04 +SM41_micro000,SM41_micro001,-1,5.74,0.01,1.04 +SM41_micro000,SM41_micro002,1,5.07,0.01,1.04 +SM42_micro000,SM42_micro001,-1,0.54,0.01,1.04 +SM42_micro000,SM42_micro002,1,0.3,0.01,1.04 +SM42_micro000,SM42_micro003,0,-5.05,0.01,1.04 +SM43_micro000,SM43_micro001,-1,1.31,0.01,1.04 +SM43_micro000,SM43_micro004,0,-5.21,0.01,1.04 +SM43_micro000,SM43_micro005,1,-0.19,0.01,1.04 +SM43_micro000,SM43_micro002,1,-2.02,0.01,1.04 +SM43_micro000,SM43_micro003,2,7.57,0.01,1.04 
+SM44_micro000,SM44_micro001,-1,6.32,0.01,1.04 +SM44_micro000,SM44_micro002,1,2.33,0.01,1.04 +SM45_micro000,SM45_micro001,-1,6.05,0.01,1.04 +SM45_micro000,SM45_micro002,1,2.25,0.01,1.04 +SM46_micro000,SM46_micro001,-1,6.52,0.01,1.04 +SM46_micro000,SM46_micro002,1,2.56,0.01,1.04 +SM46_micro000,SM46_micro004,1,2.4,0.01,1.04 +SM46_micro000,SM46_micro003,2,8.84,0.01,1.04 + +Participant name: +Stefan M. Kast, Nicolas Tielker + +Participant organization: +TU Dortmund University + +Name: +EC_RISM + +Compute time: +174 hours, CPU + +Computing and hardware: +All calculations were conducted on the LiDO 3 high performance cluster of TU Dortmund University. Calculations were automatically scheduled and ran on either an Intel Xeon E5-4604v4 or an Intel Xeon E5-2640v4 CPU, depending on node availability. + +Software: +Corina 4.3.0 +Gaussian 09 Rev E.01 +Gaussian 16 Rev C.01 +3D RISM (inhouse development) +EC-RISM (inhouse development) +Python 3.6 +Anaconda2018.12 +Amber 12 +Mathematica 12.0 (Wolfram) + +Category: +Physical (QM) + +Method: +For microstates with multiple possible stereoisomers these were generated using Corina. +50 geometries, or 200 for molecules containing more than seven rotatable bonds, were generated for each microstate using the EmbedMultipleConfs function of RDKit. These structures were pre-optimized with Amber 12 using GAFF 1.7 parameters and AM1-BCC charges with an ALPB model to represent the dielectric environment of water. +Conformations with an energy of more than 20 kcal/mol than the minimum structure of that microstate were discarded and the remaining structures clustered with a structural RMSD of 0.5 Angstrom. The cluster representatives were then optimized using Gaussian 16revC01 with IEF-PCM using default settings for water at the B3LYP/6-311+G(d,p) level of theory. 
+Additional stereoisomers were treated as if they were additional conformational states of the same microstate so that for each microstate only up to 5 conformations with the lowest PCM energies for each solvent were treated with EC-RISM/MP2/6-311+G(d,p) using the PSE2 closure [REF1] and the resulting EC-RISM energies corrected using (c1*mu_{ex}+c2*PMV_{EC-RISM}+c3*q). The correction for water has a fixed parameter c1 = 1 since this additional parameter was found to be of no predictive value in previous challenges, c2 = -0.1025 kcal*mol^-1*A^-3 and c3 = -15.7284 kcal mol^-1 e^-1. [REF2]. These yield G_{mtc} where m refers to the ionization state, t to the tautomer (microstate per ionization state) and c to the respective conformation. The different statistical weights of conformations were taken into account by computing the free energy of the resulting discrete partition function G_{mt}=-RT*ln[sum_c(exp(-G_{mtc}/RT))]. For the deprotonation process of a microstate with k protons, pKa_{raw}=(G_{k}-G_{k-1})/(RT ln(10)) and the macrostate pKa follows from m*pKa_{raw}+b where parameters m and b are calculated within the chosen level of theory by fitting to the reference data set from [REF3]. Here, m = 0.7449 and b = -150.7196. To calculate the relative free energies with respect to each micro000, four different formulas must be used, depending on the difference in the protonation state. If there is no difference in the protonation state, i.e. the microstates are tautomers, the relative free energy is calculated via m*(G(micro000)-G(m2)) to achieve thermodynamically consistent cycles. Since micro000 is always a neutral species for a single deprotonation the relative free energy is calculated using m*(G(micro000)-G(m2))+b, and -(m*(G(micro000)-G(m2))+b) for a protonation process. For the doubly protonated species the calculated relative free energy is -(m*(G(micro000)-G(m2))+2*b). Macrostate pKa values were calculated using the partition function approach of Eq.
5 in [REF4]. +The SEM was estimated as the convergence criterion for a single EC-RISM calculation. The uncertainty was estimated as the RMSE from the pKa training set. + +References: +REF1: N. Tielker, D. Tomazic, J. Heil, T. Kloss, S. Ehrhart, S. Guessregen, K. F. Schmidt, S. M. Kast, J. Comput.-Aided Mol. Des. 30, 1035-1044 (2016). +REF2: N. Tielker, L. Eberlein, S. Guessregen, S. M. Kast, J. Comput.-Aided Mol. Des. 32, 1151-1163 (2018). +REF3: Klicic, J. J., Friesner, R. A., Liu, S., Guida, W. C., J. Phys. Chem. A 106, 1327-1335 (2002). +REF4: N. Tielker, L. Eberlein, C. Chodun, S. Guessregen, S. M. Kast, J. Mol. Model. 25, 139 (2019). + +Macro pKa values: +SM25,5.42,-1.,0. +SM25,-9.75,0.,1. +SM26,5.53,-1.,0. +SM26,-9.95,0.,1. +SM27,10.17,-1.,0. +SM28,13.95,-1.,0. +SM28,-6.75,0.,1. +SM29,9.88,-1.,0. +SM30,9.4,-1.,0. +SM31,11.15,-1.,0. +SM31,-1.88,0.,1. +SM32,10.25,-1.,0. +SM33,9.8,-1.,0. +SM34,10.4,-1.,0. +SM34,-1.08,0.,1. +SM35,9.59,-1.,0. +SM36,9.41,-1.,0. +SM37,9.94,-1.,0. +SM37,-0.72,0.,1. +SM38,9.31,-1.,0. +SM39,8.45,-1.,0. +SM40,9.4,-1.,0. +SM40,-2.7,0.,1. +SM41,5.74,-1.,0. +SM41,-5.07,0.,1. +SM42,5.59,-1.,0. +SM42,-5.35,0.,1. +SM43,6.52,-1.,0. +SM43,-3.2,0.,1. +SM43,-9.59,1.,2. +SM44,6.32,-1.,0. +SM44,-2.33,0.,1. +SM45,6.05,-1.,0. +SM45,-2.25,0.,1. +SM46,6.52,-1.,0. +SM46,-2.25,0.,1. +SM46,-6.59,1.,2. + +Ranked: +True diff --git a/physical_property/pKa/Analysis/Submissions/pKa-IEFPCMMST-1.csv b/physical_property/pKa/Analysis/Submissions/pKa-IEFPCMMST-1.csv new file mode 100644 index 00000000..85cd5772 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa-IEFPCMMST-1.csv @@ -0,0 +1,202 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. 
+# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit relative free energy (RFE) predictions for all 22 molecules. Incomplete submissions will not be accepted. +# Please report RFE standard error of the mean (SEM) and RFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty +# +# If you have evaluated additional microstates, include the following: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty, SMILES string of the predicted microstate +# +# The molecule ID of the other microstate needs to be in the format: `SMXX_extra001` (number can vary) +# Also email us the `.mol2` file of your microstate with explicit hydrogens and correct bond orders, +# Please send the `.mol2` file to the email listed on the instructions page. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25_micro000,SM25_micro001,-1,16.31,0.00,1.45 +SM25_micro000,SM25_micro002,0,14.57,0.00,1.45 +SM25_micro000,SM25_micro003,-1,9.85,0.00,1.45 +SM26_micro000,SM26_micro001,-1,6.15,0.00,1.45 +SM26_micro000,SM26_micro002,0,22.09,0.00,1.45 +SM26_micro000,SM26_micro003,-1,32.35,0.00,1.45 +SM27_micro000,SM27_micro001,-1,16.78,0.00,1.45 +SM28_micro000,SM28_micro001,0,23.52,0.00,1.45 +SM28_micro000,SM28_micro002,-1,21.92,0.00,1.45 +SM28_micro000,SM28_micro003,1,9.38,0.00,1.45 +SM29_micro000,SM29_micro001,-1,15.65,0.00,1.45 +SM30_micro000,SM30_micro001,-1,14.96,0.00,1.45 +SM31_micro000,SM31_micro001,-1,14.74,0.00,1.45 +SM32_micro000,SM32_micro001,-1,16.25,0.00,1.45 +SM33_micro000,SM33_micro001,-1,14.54,0.00,1.45 +SM34_micro000,SM34_micro001,-1,14.81,0.00,1.45 +SM35_micro000,SM35_micro001,-1,13.98,0.00,1.45 +SM35_micro000,SM35_micro002,0,0.00,0.00,1.45 +SM35_micro000,SM35_micro003,-1,13.98,0.00,1.45 +SM36_micro000,SM36_micro001,-1,12.51,0.00,1.45 +SM36_micro000,SM36_micro002,0,0.00,0.00,1.45 +SM36_micro000,SM36_micro003,-1,12.51,0.00,1.45 +SM37_micro000,SM37_micro001,1,5.37,0.00,1.45 +SM37_micro000,SM37_micro002,-1,11.03,0.00,1.45 +SM37_micro000,SM37_micro003,0,0.00,0.00,1.45 +SM37_micro000,SM37_micro004,-1,5.37,0.00,1.45 +SM37_micro000,SM37_micro005,1,11.03,0.00,1.45 +SM38_micro000,SM38_micro001,-1,13.36,0.00,1.45 +SM39_micro000,SM39_micro001,-1,12.04,0.00,1.45 +SM40_micro000,SM40_micro001,-1,11.23,0.00,1.45 +SM40_micro000,SM40_micro002,1,8.57,0.00,1.45 +SM41_micro000,SM41_micro001,-1,6.98,0.00,1.45 +SM41_micro000,SM41_micro002,1,13.86,0.00,1.45 +SM42_micro000,SM42_micro001,-1,6.61,0.00,1.45 +SM42_micro000,SM42_micro002,1,14.25,0.00,1.45 +SM43_micro000,SM43_micro001,-1,6.02,0.00,1.45 +SM43_micro000,SM43_micro002,1,9.14,0.00,1.45 +SM43_micro000,SM43_micro003,2,28.22,0.00,1.45 +SM44_micro000,SM44_micro001,-1,9.64,0.00,1.45 +SM44_micro000,SM44_micro002,1,6.19,0.00,1.45 +SM45_micro000,SM45_micro001,-1,10.02,0.00,1.45 +SM45_micro000,SM45_micro002,1,7.43,0.00,1.45 
+SM46_micro000,SM46_micro001,-1,7.56,0.00,1.45 +SM46_micro000,SM46_micro002,1,6.75,0.00,1.45 +SM46_micro000,SM46_micro003,2,19.79,0.00,1.45 +SM46_micro000,SM46_micro004,1,11.25,0.00,1.45 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +CBDD Group +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +University of Barcelona and University of Costa Rica + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name of the method should not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +pKa-prediction-method-IEFPCM/MST + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +1447 hour, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +The conformational ensembles were built in the RPBS Web Portal (https://mobyle.rpbs.univ-paris-diderot.fr/cgi-bin/portal.py#forms::Frog2) using the FRee Online druG conformation generation (Frog2). Quantum mechanics computations were run on the Consorci de Serveis Universitaris de Catalunya (CSUC). + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software.
+# The 'Software:' keyword is required. +Software: +Software Frog 2.14 +Software Gaussian 16 + +# METHOD CATEGORY SECTION +# +# State if your prediction method is better classified as an +# experimental database lookup (DL), linear free energy relationship (LFER), +# quantitative structure-property relationship or machine learning (QSPR/ML), +# quantum mechanics without empirical correction (QM) model, quantum mechanics with +# linear empirical correction (QM+LEC), and combined quantum mechanics and molecular +# mechanics (QM+MM), or Other, using the following category labels: +# `DL`, `LFER`, `QSPR/ML`, `QM`, `QM+LEC`, `QM+MM` or `Other`. +# +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# Use as many lines of text as you need. +# +# We strongly encourage you to submit your predicted macro pKa values in this section in consecutive lines following this format: +# compound name, macro pKa, initial formal charge, formal charge after transition, e.g.: +# SM25, 3.5, 0, +1 +# This will allow us to check that our analysis of your free energies leads to the same endpoint as your analysis. +# +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Here, we have used the Frog 2.14 software to explore the conformational preferences of all microstates for the Sampl7 molecules. The molecular geometries of the compounds were fully optimized at the B3LYP/6-31G(d) level of theory, taking into account the solvation effect of water on the geometrical parameters of the solutes, using the IEFPCM version of the MST +model. 
The resulting minima were verified by vibrational frequency analysis, which gave positive frequencies in all cases. Then, the relative energies of the whole set of conformational species were refined from single-point computations performed at the MP2/aug-cc-pVDZ level of theory. In addition, the gas phase estimate of the free energy difference for all microstates was derived by combining the MP2 energies with zero point energy corrections (ZPE). Finally, solvation effects were added by using the B3LYP/6-31G(d) version of the IEFPCM/MST model, which is a quantum mechanical (QM) self-consistent continuum solvation method. +The pKa was determined using both the experimental hydration free energy of the proton (-270.28 kcal/mol) and a Boltzmann’s weighting scheme to the relative stabilities of the conformational species determined for the microstates involved in the equilibrium constant for the dissociation reaction following the thermodynamic cycle reported in previous studies [Brown TN, Mora Diez N. Computational determination of aqueous (pKa) values of protonated benzimidazoles (part 1). J. Phys. Chem. B. 110(18), 9270–9279 (2006)]. The uncertainties of our method are reported according to our previous work [Pérez-Areales FJ, Betari N, Viayna A, Pont C, Espargaró A, Bartolini M, De Simone A, Rinaldi Alvarenga JF, Pérez B, Sabate R, Lamuela-Raventós RM, Andrisano V, Luque FJ, Muñoz-Torrero D. Design, synthesis and multitarget biological profiling of second-generation anti-Alzheimer rhein-huprine hybrids. Future Med Chem. 2017 Jun;9(10):965-981.
doi: 10.4155/fmc-2017-0049] + +Macro pKa values +SM25,7.24,0,-1 +SM26,4.52,0,-1 +SM27,12.34,0,-1 +SM28,16.12,0,-1 +SM28,-6.90,0,+1 +SM29,11.51,0,-1 +SM30,11.00,0,-1 +SM31,10.84,0,-1 +SM32,11.95,0,-1 +SM33,10.69,0,-1 +SM34,10.64,0,-1 +SM35,10.28,0,-1 +SM36,9.20,0,-1 +SM37,-3.95,0,+1 +SM37,8.11,0,-1 +SM38,9.82,0,-1 +SM39,8.85,0,-1 +SM40,8.26,0,-1 +SM40,-6.30,0,+1 +SM41,5.13,0,-1 +SM41,-10.19,0,+1 +SM42,4.86,0,-1 +SM42,-10.48,0,+1 +SM43,4.43,0,-1 +SM43,-6.72,0,1 +SM43,-14.03,+1,+2 +SM44,7.09,0,-1 +SM44,-4.55,0,+1 +SM45,7.37,0,-1 +SM45,-5.46,0,+1 +SM46,5.56,0,-1 +SM46,-4.96,0,+1 +SM46,-9.59,+1,+2 + +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True + diff --git a/physical_property/pKa/Analysis/Submissions/pKa-RobertRaddi.csv b/physical_property/pKa/Analysis/Submissions/pKa-RobertRaddi.csv new file mode 100644 index 00000000..3b405067 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa-RobertRaddi.csv @@ -0,0 +1,194 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit relative free energy (RFE) predictions for all 22 molecules. Incomplete submissions will not be accepted. +# Please report RFE standard error of the mean (SEM) and RFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty +# +# If you have evaluated additional microstates, include the following: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty, SMILES string of the predicted microstate +# +# The molecule ID of the other microstate needs to be in the format: `SMXX_extra001` (number can vary) +# Also email us the `.mol2` file of your microstate with explicit hydrogens and correct bond orders, +# Please send the `.mol2` file to the email listed on the instructions page. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25_micro000,SM25_micro001,-1,16.33,2.22,9.39 +SM25_micro000,SM25_micro002,0,8.09,2.22,4.71 +SM25_micro000,SM25_micro003,-1,8.19,2.22,4.71 +SM26_micro000,SM26_micro001,-1,7.85,2.22,3.42 +SM26_micro000,SM26_micro002,0,-7.79,2.22,4.71 +SM26_micro000,SM26_micro003,-1,0.87,2.22,5.88 +SM27_micro000,SM27_micro001,-1,8.01,2.22,4.71 +SM28_micro000,SM28_micro001,0,-7.82,2.22,4.71 +SM28_micro000,SM28_micro002,-1,8.16,2.22,3.72 +SM28_micro000,SM28_micro003,+1,-5.18,2.22,4.0 +SM29_micro000,SM29_micro001,-1,8.02,2.22,4.71 +SM30_micro000,SM30_micro001,-1,8.06,2.22,4.71 +SM31_micro000,SM31_micro001,-1,8.02,2.22,4.71 +SM31_micro000,SM31_micro002,+1,-8.07,2.22,4.7 +SM32_micro000,SM32_micro001,-1,8.02,2.22,4.71 +SM33_micro000,SM33_micro001,-1,8.08,2.22,4.71 +SM34_micro000,SM34_micro001,-1,8.06,2.22,4.71 +SM34_micro000,SM34_micro002,+1,-8.05,2.22,4.7 +SM35_micro000,SM35_micro001,-1,8.08,2.22,4.71 +SM35_micro000,SM35_micro002,0,0.03,2.22,6.66 +SM35_micro000,SM35_micro003,-1,8.02,2.22,4.71 +SM36_micro000,SM36_micro001,-1,8.07,2.22,4.71 +SM36_micro000,SM36_micro002,0,0.0,2.22,6.66 +SM36_micro000,SM36_micro003,-1,8.06,2.22,4.71 
+SM37_micro000,SM37_micro001,+1,-8.01,2.22,4.69 +SM37_micro000,SM37_micro002,-1,8.1,2.22,4.71 +SM37_micro000,SM37_micro003,0,-0.01,2.22,6.66 +SM37_micro000,SM37_micro004,-1,8.09,2.22,4.71 +SM37_micro000,SM37_micro005,+1,-8.07,2.22,4.71 +SM38_micro000,SM38_micro001,-1,8.09,2.22,4.71 +SM39_micro000,SM39_micro001,-1,8.13,2.22,4.71 +SM40_micro000,SM40_micro001,-1,8.14,2.22,4.71 +SM40_micro000,SM40_micro002,+1,-8.02,2.22,4.68 +SM41_micro000,SM41_micro001,-1,8.01,2.22,4.59 +SM41_micro000,SM41_micro002,+1,-5.81,2.22,4.37 +SM42_micro000,SM42_micro001,-1,8.96,2.22,4.52 +SM42_micro000,SM42_micro002,+1,-7.86,2.22,4.69 +SM42_micro000,SM42_micro003,0,0.98,2.22,6.41 +SM43_micro000,SM43_micro001,-1,8.4,2.22,4.6 +SM43_micro000,SM43_micro002,+1,-7.99,2.22,4.7 +SM43_micro000,SM43_micro003,2,-15.89,2.22,9.3 +SM44_micro000,SM44_micro001,-1,8.05,2.22,4.59 +SM44_micro000,SM44_micro002,+1,-6.67,2.22,4.39 +SM45_micro000,SM45_micro001,-1,8.68,2.22,3.8 +SM45_micro000,SM45_micro002,+1,-6.66,2.22,4.42 +SM46_micro000,SM46_micro001,-1,8.19,2.22,4.63 +SM46_micro000,SM46_micro002,+1,-6.85,2.22,4.28 +SM46_micro000,SM46_micro003,2,-16.23,2.22,9.39 + + +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Robert Raddi + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Temple University + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name of the method should not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +Standard Gaussian Process + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. 
+# The 'Compute time:' keyword is required as shown here. +Compute time: +0.16 hr (compute features, 4-6 CPUs) +0.15 hr (prediction calculations, 4-6 CPUs) +0.31 hr (total, 4-6 CPUs) +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +1.0 hr to train the model (4 CPUs) +3 GHz 6-Core Intel Core i5 + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +In order of usage (1st listed is used the most) +RDKit 2020.03.4 +pandas 1.0.3 +numpy 1.18.2 +scikit-learn 0.22.1 +openeye-toolkits 2019.10.2 +openforcefield 0.6.0 + +# METHOD CATEGORY SECTION +# +# State if your prediction method is better classified as an +# experimental database lookup (DL), linear free energy relationship (LFER), +# quantitative structure-property relationship or machine learning (QSPR/ML), +# quantum mechanics without empirical correction (QM) model, quantum mechanics with +# linear empirical correction (QM+LEC), and combined quantum mechanics and molecular +# mechanics (QM+MM), or Other, using the following category labels: +# `DL`, `LFER`, `QSPR/ML`, `QM`, `QM+LEC`, `QM+MM` or `Other`. +# +# Pick only one category label. +# The `Category:` keyword is required. +Category: +QSPR/ML + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# Use as many lines of text as you need. 
+# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +This submission is the first step in attempts to improve a Gaussian process pKa predictor previously submitted in the SAMPL6 challenge \cite{bannan2018sampl6}. For the features and model parameters, please refer to the work by Bannan et al. The ten features used in the previous model were closely followed here. Six features are the AM1BCC partial charges described by Bannan et al. Partial charges were computed using Openforcefield and RDKit. Two other features are the free energy of solvation and the change in enthalpy. These were both computed using OpenEye-toolkits. The remaining two features are solvent accessible surface area of the deprotonated atom via the Shrake algorithm and the partial bond order via the extended Hückel molecular orbital method to obtain the overlap populations, both calculated using RDKit. +The primary difference between these two models is the training set. Our hand curated training set consists of approximately 3500 small molecules entirely from open-source databases. Approximately 40 small molecules with sulfonamide groups were added in attempt to assist predictions and increase chemical diversity. Model validation suggests a subset (1122 monoprotic) of the dataset gives the best training set for optimal kernel parameters. The optimized kernel parameters used for making predictions were obtained by executing a 4-fold model validation procedure. As suggested by Bannan et al., we added a small subset of diprotic acids and all monoprotics that had experimental pKas between 0 and 12 back into the training set (2752 total) to increase the chemical space when making predictions. +Statistical uncertainty in micro-pKa was computed during model validation. 
The standard Gaussian process model automatically provides uncertainties that correspond to how closely the molecule of interest overlaps with the molecules in the training set. Note that the predictions reported have high uncertainty due to the lack of chemical space in the training set as well as the lack of similarity of the molecules of interest to that of the molecules found in the training set. Microscopic pKa values are directly related to relative free energies and were computed following the work by Gunner et al.\cite{gunner2020standard} in units of kcal/mol. + +//ref.bib +@article{bannan2018sampl6, + title={SAMPL6 challenge results from $pK_a$ predictions based on a general Gaussian process model}, + author={Bannan, Caitlin C and Mobley, David L and Skillman, A Geoffrey}, + journal={Journal of computer-aided molecular design}, + volume={32}, + number={10}, + pages={1165--1177}, + year={2018}, + publisher={Springer} +} +@article{gunner2020standard, + title={Standard state free energies, not pKas, are ideal for describing small molecule protonation and tautomeric states}, + author={Gunner, Marilyn R and Murakami, Taichi and Rustenburg, Ari{\"e}n S and I{\c{s}}{\i}k, Mehtap and Chodera, John D}, + journal={Journal of Computer-Aided Molecular Design}, + pages={1--13}, + year={2020}, + publisher={Springer} +} + + + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline.
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True + diff --git a/physical_property/pKa/Analysis/Submissions/pKa-VA-2.csv b/physical_property/pKa/Analysis/Submissions/pKa-VA-2.csv new file mode 100644 index 00000000..34fd01f6 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa-VA-2.csv @@ -0,0 +1,86 @@ +Predictions: + +SM25_micro000,SM25_micro001,1,-4.64,0.5,0.8 +SM26_micro000,SM26_micro001,1,-7.15,0.5,0.8 +SM27_micro000,SM27_micro001,1,-9.76,0.5,0.8 +SM28_micro000,SM28_micro001,1,-14.41,0.5,0.8 +SM29_micro000,SM29_micro001,1,-8.59,0.5,0.8 +SM30_micro000,SM30_micro001,1,-8.97,0.5,0.8 +SM31_micro000,SM31_micro001,1,-12.63,0.5,0.8 +SM32_micro000,SM32_micro001,1,-9.59,0.5,0.8 +SM33_micro000,SM33_micro001,1,-8.81,0.5,0.8 +SM34_micro000,SM34_micro001,1,-10.58,0.5,0.8 +SM35_micro000,SM35_micro001,1,-10.5,0.5,0.8 +SM36_micro000,SM36_micro001,1,-9.36,0.5,0.8 +SM37_micro000,SM37_micro001,1,-9.97,0.5,0.8 +SM38_micro000,SM38_micro001,1,-6.11,0.5,0.8 +SM39_micro000,SM39_micro001,1,-5.84,0.5,0.8 +SM40_micro000,SM40_micro001,1,-8.76,0.5,0.8 +SM41_micro000,SM41_micro001,1,-10.77,0.5,0.8 +SM42_micro000,SM42_micro001,1,-8.96,0.5,0.8 +SM43_micro000,SM43_micro001,1,-9.73,0.5,0.8 +SM44_micro000,SM44_micro001,1,-9.39,0.5,0.8 +SM45_micro000,SM45_micro001,1,-10.72,0.5,0.8 +SM46_micro000,SM46_micro001,1,-13.38,0.5,0.8 + +Participant name: +Viktorya Aviyente +,,,,, +Participant organization: +Bogazici University + +Name: +RFE-prediction-M052X + +Compute time: +30 hours + +Computing and hardware: +All the simulations were performed with GPU01 and CURIE clusters +GPU01, Intel Xeon E5-2697A /32 Core-NVIDIA GK210GL [Tesla K80] x 4 G4GB RAM +CURIE, Intel Xeon E7-4870 v2s/120 core-512GB RAM + +Software: +Spartan14 V1.1.0 +GaussView 6.0.16 +Gaussian16-RevA.03 + +Category: +Physical (QM) + +Method: +All possible conformations of the ligands are located with the semi-empirical PM3 method by using the SPARTAN software. 
Free rotations around single bonds are taken into account and all the geometries corresponding to stationary points are re-optimised with the Gaussian software package by density functional theory (DFT) using the M052X functional and the 6-31+G(d,p) basis set. Solvation Model based on Density (SMD) method at 298.15K is used for the optimizations of the conformations in water. +Thermodynamic Cycle 3 was used to calculate the pKa values of the molecules for the global minima (Equation 1) +pKa(AH)=(DG_soln/2.303RT)+pKa(BH) (Equation 1) +where BH represents a reference molecule which is similar in structure to the molecules questioned +For the calculation of standard error of mean (SEM), standard deviations (stdev) are calculated according to the Equation 2 where value is the RFE of each molecule, mean is the mean of RFEs of the molecules of interest and N is the size of the population +stdev = sqrt((1/(N-1))*(sum of (value-mean)^2)) (Equation 2) +After calculation of standard deviations, standard error of mean (SEM) values are calculated according to the Equation 3 +SEM=stdev/sqrt(N) (Equation 3) + +Macro pKa predictions: +SM25,4.64,0,-1 +SM26,7.15,0,-1 +SM27,9.76,0,-1 +SM28,14.41,0,-1 +SM29,8.59,0,-1 +SM30,8.97,0,-1 +SM31,12.63,0,-1 +SM32,9.59,0,-1 +SM33,8.81,0,-1 +SM34,10.58,0,-1 +SM35,10.5,0,-1 +SM36,9.36,0,-1 +SM37,9.97,0,-1 +SM38,6.11,0,-1 +SM39,5.84,0,-1 +SM40,8.76,0,-1 +SM41,10.77,0,-1 +SM42,8.96,0,-1 +SM43,9.73,0,-1 +SM44,9.39,0,-1 +SM45,10.72,0,-1 +SM46,13.38,0,-1 + +Ranked: +True diff --git a/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_1.csv b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_1.csv new file mode 100644 index 00000000..302de9d9 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_1.csv @@ -0,0 +1,152 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# PREDICTION SECTION +# +Predictions: +SM25_micro000,SM25_micro001,-1,11.36,0,0.5 
+SM25_micro000,SM25_micro002,0,-14.89,0,0.5 +SM25_micro000,SM25_micro003,-1,-8.14,0,0.5 +SM26_micro000,SM26_micro001,-1,-4.33,0,0.5 +SM26_micro000,SM26_micro002,0,19.50,0,0.5 +SM26_micro000,SM26_micro003,-1,-21.00,0,0.5 +SM27_micro000,SM27_micro001,-1,-12.99,0,0.5 +SM28_micro000,SM28_micro001,0,25.74,0,0.5 +SM28_micro000,SM28_micro002,-1,-16.37,0,0.5 +SM28_micro000,SM28_micro003,1,2.79,0,0.5 +SM29_micro000,SM29_micro001,-1,-11.67,0,0.5 +SM30_micro000,SM30_micro001,-1,-10.37,0,0.5 +SM31_micro000,SM31_micro001,-1,-11.53,0,0.5 +SM32_micro000,SM32_micro001,-1,-12.03,0,0.5 +SM33_micro000,SM33_micro001,-1,-10.13,0,0.5 +SM34_micro000,SM34_micro001,-1,-12.16,0,0.5 +SM35_micro000,SM35_micro001,-1,-10.72,0,0.5 +SM35_micro000,SM35_micro002,0,0.72,0,0.5 +SM35_micro000,SM35_micro003,-1,-13.04,0,0.5 +SM36_micro000,SM36_micro001,-1,-9.03,0,0.5 +SM36_micro000,SM36_micro002,0,-0.85,0,0.5 +SM36_micro000,SM36_micro003,-1,-9.86,0,0.5 +SM37_micro000,SM37_micro001,1,7.40,0,0.5 +SM37_micro000,SM37_micro002,-1,-9.26,0,0.5 +SM37_micro000,SM37_micro003,0,0.14,0,0.5 +SM37_micro000,SM37_micro004,-1,-7.79,0,0.5 +SM37_micro000,SM37_micro005,1,6.45,0,0.5 +SM38_micro000,SM38_micro001,-1,-12.38,0,0.5 +SM39_micro000,SM39_micro001,-1,-8.64,0,0.5 +SM40_micro000,SM40_micro001,-1,-8.98,0,0.5 +SM40_micro000,SM40_micro002,1,7.83,0,0.5 +SM41_micro000,SM41_micro001,-1,-6.22,0,0.5 +SM41_micro000,SM41_micro002,1,7.18,0,0.5 +SM42_micro000,SM42_micro001,-1,-5.50,0,0.5 +SM42_micro000,SM42_micro002,1,7.85,0,0.5 +SM43_micro000,SM43_micro001,-1,-5.74,0,0.5 +SM43_micro000,SM43_micro002,1,8.00,0,0.5 +SM43_micro000,SM43_micro003,2,44.39,0,0.5 +SM44_micro000,SM44_micro001,-1,-6.53,0,0.5 +SM44_micro000,SM44_micro002,1,5.43,0,0.5 +SM45_micro000,SM45_micro001,-1,-6.85,0,0.5 +SM45_micro000,SM45_micro002,1,6.03,0,0.5 +SM46_micro000,SM46_micro001,-1,-7.24,0,0.5 +SM46_micro000,SM46_micro002,1,5.30,0,0.5 +SM46_micro000,SM46_micro003,2,39.25,0,0.5 + +# +Participant name: +Sergio Antonio Rodriguez +Andrew S. 
Paluch + +# +Participant organization: +University of Santiago del Estero, Argentina +Miami University + +# +Name: +RFE-SMD-explicit-water + +# +# COMPUTE TIME SECTION +# +Compute time: +5 hours, CPU + +# +# COMPUTING AND HARDWARE SECTION +# +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +Software: +Gaussian 16 + +# METHOD CATEGORY SECTION +# +Category: +QM quantum mechanics without empirical correction + +# METHOD DESCRIPTION SECTION +# +Method: +Calculations were performed by the Gaussian 16 revision E01 series of programs using one hybrid density functional M06-2X, 6-31+G(d,p) as basis set, +and the SMD implicit solvation model. Geometries were fully optimized in aqueous solution. Harmonic frequencies were calculated to confirm that the +structures were minima on the potential energy surface and to obtain the thermal and entropic contributions to the free energies. +The calculation of pKa was based on the use of the direct approach, given by the proton dissociation reaction. The pKa value of the +molecule HA was calculated according to the following equation + pKa= G*aq/2.303RT; G*aq= G*aq,A- + G*aq,H+ - G*aq,HA +where G*aq,A- and G*aq,HA are the standard free energies of the deprotonated and protonated species, respectively, calculated directly in aqueous solution at 298.15 K. 
+To improve the calculation of the solvation effects that are important for pKa calculations, mainly in charged structures, we included one explicit water molecule +directly hydrogen-bonded to the site being protonated/deprotonated. For each hydrogen bonding site, several orientations of the added water were considered and only +the lowest-energy structure was used. + +Macro pKa values +# compound name, macro pKa, initial formal charge, formal charge after transition, e.g.: +SM25, 8.14, 0, -1 +SM26, 4.33, 0, -1 +SM26, 6.70, 0, -1 +SM27, 12.99, 0, -1 +SM28, 16.37, 0, -1 +SM28, -2.79, +1, 0 +SM29, 11.67, 0, -1 +SM30, 10.37, 0, -1 +SM31, 11.53, 0, -1 +SM32, 12.03, 0, -1 +SM33, 10.13, 0, -1 +SM34, 12.16, 0, -1 +SM35, 10.72, 0, -1 +SM35, 12.52, 0, -1 +SM36, 9.03, 0, -1 +SM36, 10.48, 0, -1 +SM37, 9.26, 0, -1 +SM37, 9.15, 0, -1 +SM37, -7.40, +1, 0 +SM37, -6.45, +1, 0 +SM38, 12.38, 0, -1 +SM39, 8.64, 0, -1 +SM40, 8.98, 0, -1 +SM40, -7.83, +1, 0 +SM41, 6.22, 0, -1 +SM41, -7.18, +1, 0 +SM42, 5.50, 0, -1 +SM42, -7.85, +1, 0 +SM43, 5.74, 0, -1 +SM43, -8.00, +1, 0 +SM43, -22.87, +2, 0 +SM44, 6.53, 0, -1 +SM44, -5.43, +1, 0 +SM45, 6.85, 0, -1 +SM45, -6.03, +1, 0 +SM46, 7.24, 0, -1 +SM46, -5.30, +1, 0 +SM46, -19.63, +2, 0 + +# +Ranked: +True diff --git a/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_2.csv b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_2.csv new file mode 100644 index 00000000..c61aaf40 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_2.csv @@ -0,0 +1,146 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# PREDICTION SECTION +# +Predictions: +SM25_micro000,SM25_micro001,-1, 10.90,0,0.4 +SM25_micro000,SM25_micro002,0,-13.46,0,0.4 +SM25_micro000,SM25_micro003,-1,-9.16,0,0.4 +SM26_micro000,SM26_micro001,-1,-6.05,0,0.4 +SM26_micro000,SM26_micro002,0,23.13,0,0.4 +SM26_micro000,SM26_micro003,-1,-23.44,0,0.4 +SM27_micro000,SM27_micro001,-1,-13.84,0,0.4
+SM28_micro000,SM28_micro001,0,28.41,0,0.4 +SM28_micro000,SM28_micro002,-1,-19.34,0,0.4 +SM28_micro000,SM28_micro003,1,4.57,0,0.4 +SM29_micro000,SM29_micro001,-1,-12.36,0,0.4 +SM30_micro000,SM30_micro001,-1,-12.86,0,0.4 +SM31_micro000,SM31_micro001,-1,-12.75,0,0.4 +SM32_micro000,SM32_micro001,-1,-12.34,0,0.4 +SM33_micro000,SM33_micro001,-1,-12.44,0,0.4 +SM34_micro000,SM34_micro001,-1,-12.41,0,0.4 +SM35_micro000,SM35_micro001,-1,-11.83,0,0.4 +SM35_micro000,SM35_micro002,0,-1.85,0,0.4 +SM35_micro000,SM35_micro003,-1,-12.97,0,0.4 +SM36_micro000,SM36_micro001,-1,-10.85,0,0.4 +SM36_micro000,SM36_micro002,0,-2.60,0,0.4 +SM36_micro000,SM36_micro003,-1,-10.71,0,0.4 +SM37_micro000,SM37_micro001,1,3.93,0,0.4 +SM37_micro000,SM37_micro002,-1,-8.20,0,0.4 +SM37_micro000,SM37_micro003,0,-1.87,0,0.4 +SM37_micro000,SM37_micro004,-1,-8.20,0,0.4 +SM37_micro000,SM37_micro005,1,3.14,0,0.4 +SM38_micro000,SM38_micro001,-1,-12.72,0,0.4 +SM39_micro000,SM39_micro001,-1,-10.64,0,0.4 +SM40_micro000,SM40_micro001,-1,-10.50,0,0.4 +SM40_micro000,SM40_micro002,1,6.60,0,0.4 +SM41_micro000,SM41_micro001,-1,-7.41,0,0.4 +SM41_micro000,SM41_micro002,1,5.53,0,0.4 +SM42_micro000,SM42_micro001,-1,-1.83,0,0.4 +SM42_micro000,SM42_micro002,1,1.22,0,0.4 +SM43_micro000,SM43_micro001,-1,-2.16,0,0.4 +SM43_micro000,SM43_micro002,1,-8.00,0,0.4 +SM43_micro000,SM43_micro003,2,25.54,0,0.4 +SM44_micro000,SM44_micro001,-1,-8.97,0,0.4 +SM44_micro000,SM44_micro002,1,4.49,0,0.4 +SM45_micro000,SM45_micro001,-1,-8.59,0,0.4 +SM45_micro000,SM45_micro002,1,3.96,0,0.4 +SM46_micro000,SM46_micro001,-1,-8.49,0,0.4 +SM46_micro000,SM46_micro002,1,3.98,0,0.4 +SM46_micro000,SM46_micro003,2,28.62,0,0.4 +# +Participant name: +Sergio Antonio Rodriguez +Andrew S. 
Paluch +# +Participant organization: +University of Santiago del Estero, Argentina +Miami University +# +Name: +RFE-SMD-SAS +# +# COMPUTE TIME SECTION +# +Compute time: +3 hours, CPU +# +# COMPUTING AND HARDWARE SECTION +# +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +Software: +Gaussian 16 + +# METHOD CATEGORY SECTION +# +Category: +QM quantum mechanics + +# METHOD DESCRIPTION SECTION +# +Method: +Calculations were performed by the Gaussian 16 revision E01 series of programs using one hybrid density functional M06-2X, 6-31+G(d,p) as basis set, +and the SMD implicit solvation model. Geometries were fully optimized in aqueous solution. Harmonic frequencies were calculated to confirm that the +structures were minima on the potential energy surface and to obtain the thermal and entropic contributions to the free energies. +In the geometry optimizations, the surface type and the scaling factor options in the SCRF section were tuned. By choosing SAS as the +solute-solvent boundary, the solvent radius (1.385 A for water) is added to the intrinsic Coulomb radii to construct the cavity. +It is necessary to add the scale factor a = 0.485 and the surface of SAS. +The calculation of pKa was based on the use of the direct approach, given by the proton dissociation reaction. 
The pKa value of the +molecule HA was calculated according to the following equation + pKa= G*aq/2.303RT; G*aq= G*aq,A- + G*aq,H+ - G*aq,HA +where G*aq,A- and G*aq,HA are the standard free energies of the deprotonated and protonated species, respectively, calculated directly in aqueous solution at 298.15 K. + +Macro pKa values +# compound name, macro pKa, initial formal charge, formal charge after transition, e.g.: +SM25, 9.16, 0, -1 +SM26, 6.05, 0, -1 +SM26, 6.48, 0, -1 +SM27, 13.84, 0, -1 +SM28, 19.34, 0, -1 +SM28, -4.59, +1, 0 +SM29, 12.36, 0, -1 +SM30, 12.85, 0, -1 +SM31, 12.75, 0, -1 +SM32, 12.34, 0, -1 +SM33, 12.44, 0, -1 +SM34, 12.41, 0, -1 +SM35, 11.83, 0, -1 +SM35, 14.33, 0, -1 +SM36, 10.85, 0, -1 +SM36, 12.61, 0, -1 +SM37, 8.20, 0, -1 +SM37, 9.57, 0, -1 +SM37, -3.93, +1, 0 +SM37, -4.51, +1, 0 +SM38, 12.72, 0, -1 +SM39, 10.64, 0, -1 +SM40, 10.50, 0, -1 +SM40, -6.60, +1, 0 +SM41, 7.41, 0, -1 +SM41, -5.53, +1, 0 +SM42, 1.83, 0, -1 +SM42, -1.22, +1, 0 +SM43, 2.16, 0, -1 +SM43, -8.00, +1, 0 +SM43, -12.77, +2, 0 +SM44, 8.97, 0, -1 +SM44, -4.49, +1, 0 +SM45, 8.59, 0, -1 +SM45, -3.96, +1, 0 +SM46, 8.49, 0, -1 +SM46, -3.98, +1, 0 +SM46, -14.31, +2, 0 +# +Ranked: +False diff --git a/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_3.csv b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_3.csv new file mode 100644 index 00000000..9c00ddd8 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa_RodriguezPaluch_SMD_3.csv @@ -0,0 +1,143 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# PREDICTION SECTION +# +Predictions: +SM25_micro000,SM25_micro001,-1,10.49,0, 1.02 +SM25_micro000,SM25_micro002,0,-12.80,0,1.02 +SM25_micro000,SM25_micro003,-1,-9.16,0,1.02 +SM26_micro000,SM26_micro001,-1,-5.94,0,1.02 +SM26_micro000,SM26_micro002,0,20.81,0,1.02 +SM26_micro000,SM26_micro003,-1,-22.11,0,1.02 +SM27_micro000,SM27_micro001,-1,-13.86,0,1.02 +SM28_micro000,SM28_micro001,0,25.72,0,1.02 
+SM28_micro000,SM28_micro002,-1,-19.08,0,1.02 +SM28_micro000,SM28_micro003,1,7.28,0,1.02 +SM29_micro000,SM29_micro001,-1,-13.60,0,1.02 +SM30_micro000,SM30_micro001,-1,-12.90,0,1.02 +SM31_micro000,SM31_micro001,-1,-13.16,0,1.02 +SM32_micro000,SM32_micro001,-1,-13.74,0,1.02 +SM33_micro000,SM33_micro001,-1,-11.88,0,1.02 +SM34_micro000,SM34_micro001,-1,-14.35,0,1.02 +SM35_micro000,SM35_micro001,-1,-11.70,0,1.02 +SM35_micro000,SM35_micro002,0,-1.12,0,1.02 +SM35_micro000,SM35_micro003,-1,-13.20,0,1.02 +SM36_micro000,SM36_micro001,-1,-9.53,0,1.02 +SM36_micro000,SM36_micro002,0,-3.22,0,1.02 +SM36_micro000,SM36_micro003,-1,-9.51,0,1.02 +SM37_micro000,SM37_micro001,1,8.52,0,1.02 +SM37_micro000,SM37_micro002,-1,-9.07,0,1.02 +SM37_micro000,SM37_micro003,0,-3.03,0,1.02 +SM37_micro000,SM37_micro004,-1,-9.07,0,1.02 +SM37_micro000,SM37_micro005,1,7.72,0,1.02 +SM38_micro000,SM38_micro001,-1,-12.42,0,1.02 +SM39_micro000,SM39_micro001,-1,-10.16,0,1.02 +SM40_micro000,SM40_micro001,-1,-11.46,0,1.02 +SM40_micro000,SM40_micro002,1,11.28,0,1.02 +SM41_micro000,SM41_micro001,-1,-6.21,0,1.02 +SM41_micro000,SM41_micro002,1,9.38,0,1.02 +SM42_micro000,SM42_micro001,-1,-0.39,0,1.02 +SM42_micro000,SM42_micro002,1,4.63,0,1.02 +SM43_micro000,SM43_micro001,-1,-1.72,0,1.02 +SM43_micro000,SM43_micro002,1,8.00,0,1.02 +SM43_micro000,SM43_micro003,2,44.38,0,1.02 +SM44_micro000,SM44_micro001,-1,-8.85,0,1.02 +SM44_micro000,SM44_micro002,1,7.47,0,1.02 +SM45_micro000,SM45_micro001,-1,-8.25,0,1.02 +SM45_micro000,SM45_micro002,1,8.01,0,1.02 +SM46_micro000,SM46_micro001,-1,-8.73,0,1.02 +SM46_micro000,SM46_micro002,1,6.98,0,1.02 +SM46_micro000,SM46_micro003,2,46.30,0,1.02 +# +Participant name: +Sergio Antonio Rodriguez +Andrew S. 
Paluch +# +Participant organization: +University of Santiago del Estero, Argentina +Miami University +# +Name: +RFE-SMD +# +# COMPUTE TIME SECTION +# +Compute time: +5 hours, CPU +# +# COMPUTING AND HARDWARE SECTION +# +Computing and hardware: +All simulations were performed on the Pitzer Cluster at the Ohio Supercomputer Center +(https://www.osc.edu/resources/technical_support/supercomputers/pitzer). +The reported compute time is taken as the walltime times the number of processors (core hours). +Pitzer has a mix of processors. We did not specify the processor type upon submission, so a mix was used. +The original Pitzer cluster was installed in late 2018 and +is a Dell-built, Intel Xeon Skylake processor-based supercomputer with 260 nodes. +In September 2020, OSC installed an additional 398 Intel Xeon Cascade Lake processor-based +nodes as part of a Pitzer Expansion cluster. + +# SOFTWARE SECTION +# +Software: +Gaussian 16 + +# METHOD CATEGORY SECTION +# +Category: +QM quantum mechanics + +# METHOD DESCRIPTION SECTION +# +Method: +Calculations were performed by the Gaussian 16 revision E01 series of programs using one hybrid density functional M06-2X, 6-31+G(d,p) as basis set, +and the SMD implicit solvation model. Geometries were fully optimized in aqueous solution. Harmonic frequencies were calculated to confirm that +the structures were minima on the potential energy surface and to obtain the thermal and entropic contributions to the free energies. +The calculation of pKa was based on the use of the direct approach, given by the proton dissociation reaction. The pKa value of the molecule HA +was calculated according to the following equation + pKa= G*aq/2.303RT; G*aq=G*aq,A- + G*aq,H+ - G*aq,HA +where G*aq,A- and G*aq,HA are the standard free energies of the deprotonated and protonated species, respectively, calculated directly in aqueous solution at 298.15 K. 
+ +Macro pKa values +# compound name, macro pKa, initial formal charge, formal charge after transition, e.g.: +SM25, 9.16, 0, -1 +SM26, 5.94, 0, -1 +SM26, 6.85, 0, -1 +SM27, 13.86, 0, -1 +SM28, 19.08, 0, -1 +SM28, -7.28, +1, 0 +SM29, 13.60, 0, -1 +SM30, 12.90, 0, -1 +SM31, 13.16, 0, -1 +SM32, 13.74, 0, -1 +SM33, 11.88, 0, -1 +SM34, 14.35, 0, -1 +SM35, 11.70, 0, -1 +SM35, 14.02, 0, -1 +SM36, 9.53, 0, -1 +SM36, 11.87, 0, -1 +SM37, 9.07, 0, -1 +SM37, 11.29, 0, -1 +SM37, -8.52, +1, 0 +SM37, -9.94, +1, 0 +SM38, 12.42, 0, -1 +SM39, 10.16, 0, -1 +SM40, 11.46, 0, -1 +SM40, -11.28, +1, 0 +SM41, 6.21, 0, -1 +SM41, -9.38, +1, 0 +SM42, 0.39, 0, -1 +SM42, -4.63, +1, 0 +SM43, 1.72, 0, -1 +SM43, -8.00, +1, 0 +SM43, -22.19, +2, 0 +SM44, 8.85, 0, -1 +SM44, -7.47, +1, 0 +SM45, 8.25, 0, -1 +SM45, -8.01, +1, 0 +SM46, 8.73, 0, -1 +SM46, -6.98, +1, 0 +SM46, -23.15, +2, 0 +# +Ranked: +False diff --git a/physical_property/pKa/Analysis/Submissions/pKa_prediction_Iorga_Beckstein_1.csv b/physical_property/pKa/Analysis/Submissions/pKa_prediction_Iorga_Beckstein_1.csv new file mode 100644 index 00000000..b445df4c --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pKa_prediction_Iorga_Beckstein_1.csv @@ -0,0 +1,198 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit relative free energy (RFE) predictions for all 22 molecules. Incomplete submissions will not be accepted. +# Please report RFE standard error of the mean (SEM) and RFE model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty +# +# If you have evaluated additional microstates, include the following: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty, SMILES string of the predicted microstate +# +# The molecule ID of the other microstate needs to be in the format: `SMXX_extra001` (number can vary) +# Also email us the `.mol2` file of your microstate with explicit hydrogens and correct bond orders, +# Please send the `.mol2` file to the email listed on the instructions page. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25_micro000,SM25_micro001,-1,-2.70,0.00,0.90 +SM25_micro000,SM25_micro002,0,-2.00,0.00,0.90 +SM25_micro000,SM25_micro003,-1,4.53,0.00,0.90 +SM25_micro000,SM25_micro004,0,-5.75,0.00,0.90 +SM25_micro000,SM25_micro005,+1,0.65,0.00,0.90 +SM26_micro000,SM26_micro001,-1,3.02,0.00,0.90 +SM26_micro000,SM26_micro002,0,6.95,0.00,0.90 +SM26_micro000,SM26_micro003,-1,11.12,0.00,0.90 +SM26_micro000,SM26_micro004,0,3.78,0.00,0.90 +SM26_micro000,SM26_micro005,+1,6.34,0.00,0.90 +SM27_micro000,SM27_micro001,-1,6.11,0.00,0.90 +SM28_micro000,SM28_micro001,0,8.29,0.00,0.90 +SM28_micro000,SM28_micro002,-1,8.78,0.00,0.90 +SM28_micro000,SM28_micro003,+1,3.91,0.00,0.90 +SM28_micro000,SM28_micro004,-1,18.70,0.00,0.90 +SM29_micro000,SM29_micro001,-1,6.33,0.00,0.90 +SM30_micro000,SM30_micro001,-1,5.86,0.00,0.90 +SM31_micro000,SM31_extra001,+1,4.51,0.00,0.90,C[NH+](S(=O)(NC1(CCc2ccccc2)COC1)=O)C +SM31_micro000,SM31_micro001,-1,6.00,0.00,0.90 +SM32_micro000,SM32_micro001,-1,5.91,0.00,0.90 +SM33_micro000,SM33_micro001,-1,5.17,0.00,0.90 +SM34_micro000,SM34_extra001,+1,4.37,0.00,0.90,C[NH+](S(=O)(NC1(CCc2ccccc2)CSC1)=O)C 
+SM34_micro000,SM34_micro001,-1,5.73,0.00,0.90 +SM35_micro000,SM35_micro001,-1,3.20,0.00,0.90 +SM35_micro000,SM35_micro002,0,-1.89,0.00,0.90 +SM35_micro000,SM35_micro003,-1,3.20,0.00,0.90 +SM36_micro000,SM36_micro001,-1,5.22,0.00,0.90 +SM36_micro000,SM36_micro002,0,0.38,0.00,0.90 +SM36_micro000,SM36_micro003,-1,5.23,0.00,0.90 +SM37_micro000,SM37_micro001,+1,2.13,0.00,0.90 +SM37_micro000,SM37_micro002,-1,3.70,0.00,0.90 +SM37_micro000,SM37_micro003,0,-1.49,0.00,0.90 +SM37_micro000,SM37_micro004,-1,3.78,0.00,0.90 +SM37_micro000,SM37_micro005,+1,1.26,0.00,0.90 +SM38_micro000,SM38_micro001,-1,5.22,0.00,0.90 +SM39_micro000,SM39_micro001,-1,5.13,0.00,0.90 +SM40_micro000,SM40_micro001,-1,4.98,0.00,0.90 +SM40_micro000,SM40_micro002,+1,6.17,0.00,0.90 +SM41_micro000,SM41_micro001,-1,2.17,0.00,0.90 +SM41_micro000,SM41_micro002,+1,3.09,0.00,0.90 +SM42_micro000,SM42_micro001,-1,0.15,0.00,0.90 +SM42_micro000,SM42_micro002,+1,0.79,0.00,0.90 +SM42_micro000,SM42_micro003,0,-1.95,0.00,0.90 +SM43_micro000,SM43_micro001,-1,0.50,0.00,0.90 +SM43_micro000,SM43_micro002,+1,2.95,0.00,0.90 +SM43_micro000,SM43_micro003,+2,6.25,0.00,0.90 +SM43_micro000,SM43_micro004,0,-2.08,0.00,0.90 +SM43_micro000,SM43_micro005,+1,0.41,0.00,0.90 +SM44_micro000,SM44_micro001,-1,4.25,0.00,0.90 +SM44_micro000,SM44_micro002,+1,3.55,0.00,0.90 +SM45_micro000,SM45_micro001,-1,4.39,0.00,0.90 +SM45_micro000,SM45_micro002,+1,3.68,0.00,0.90 +SM46_micro000,SM46_micro001,-1,4.00,0.00,0.90 +SM46_micro000,SM46_micro002,+1,3.51,0.00,0.90 +SM46_micro000,SM46_micro003,+2,8.07,0.00,0.90 +SM46_micro000,SM46_micro004,+1,5.18,0.00,0.90 +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Bogdan I. Iorga/Oliver Beckstein +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. 
+Participant organization: +ICSN, CNRS, Gif-sur-Yvette, France/Arizona State University, USA +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name of the method should not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +pKa_SAMPL7_Gaussian_corrected +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +53 hours, CPU +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All the simulations were performed in parallel (8 cores for each simulation) on cluster nodes running with CentOS6 and 4 CPU Intel Xeon E5-4627 v3 @ 2.60GHz. +# +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. 
+Software: +Corina 4.2.0 +Gaussian 09, Revision D.01 +GaussView5 +# +# METHOD CATEGORY SECTION +# +# State if your prediction method is better classified as an +# experimental database lookup (DL), linear free energy relationship (LFER), +# quantitative structure-property relationship or machine learning (QSPR/ML), +# quantum mechanics without empirical correction (QM) model, quantum mechanics with +# linear empirical correction (QM+LEC), and combined quantum mechanics and molecular +# mechanics (QM+MM), or Other, using the following category labels: +# `DL`, `LFER`, `QSPR/ML`, `QM`, `QM+LEC`, `QM+MM` or `Other`. +# +# Pick only one category label. +# The `Category:` keyword is required. +Category: +QM+LEC +# +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# Use as many lines of text as you need. +# +# We strongly encourage you to submit your predicted macro pKa values in this section in consecutive lines following this format: +# compound name, macro pKa, initial formal charge, formal charge after transition, e.g.: +# SM25, 3.5, 0, +1 +# This will allow us to check that our analysis of your free energies leads to the same endpoint as your analysis. +# +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +For the pKa prediction we followed the method that we used in the SAMPL6 pKa prediction challenge (Selwa, E., Kenney, I. M., Beckstein, O., and Iorga, B. I. SAMPL6: calculation of macroscopic pKa values from ab initio quantum mechanical free energies, J. Comput. Aided Mol. Des. 2018, 32, 1203-1216 DOI: 10.1007/s10822-018-0138-6.), which is based on the method described in Muckerman JT, Skone JH, Ning M, Wasada-Tsutsui Y. 
Toward the accurate calculation of pKa values in water and acetonitrile. Biochim Biophys Acta. 2013, 1827, 882-891. +With this method we calculated the relative energies for the microstates provided by the organizers, as well as for 4 reference compounds with structures similar compared with the compounds from the SAMPL7-pKa dataset and for which experimental pKa values are available. These reference compounds are compound 26 from Borhade SR, Svensson R, Brandt P, Artursson P, Arvidsson PI, Sandstrom A. Preclinical characterization of acyl sulfonimidamides: potential carboxylic acid bioisosteres with tunable properties. ChemMedChem. 2015 Mar;10(3):455-60. doi: 10.1002/cmdc.201402497. and compounds 12, 13 and 14 from Lassalas P, Gay B, Lasfargeas C, James MJ, Tran V, Vijayendran KG, Brunden KR, Kozlowski MC, Thomas CJ, Smith AB 3rd, Huryn DM, Ballatore C. Structure Property Relationships of Carboxylic Acid Isosteres. J Med Chem. 2016 Apr 14;59(7):3183-203. doi: 10.1021/acs.jmedchem.5b01963. +The experimental values compared with the energy corresponding to the difference between the experimental and computed pKa values were plotted, and the linear fitting of these values (R2=0.867) using Microsoft Excel provided the correction to be applied to the SAMPL7 pKa calculations. +Therefore, the microscopic relative free energies and the macroscopic pKa values reported here were corrected using this procedure to remove the systematic error related to the QM calculations. +# +Macro pKa values: +SM25, 2.24, -1, 0 +SM26, 2.22, -1, 0 +SM27, 4.48, -1, 0 +SM28, 6.44, -1, 0 +SM29, 4.64, -1, 0 +SM30, 4.30, -1, 0 +SM31, 4.40, -1, 0 +SM32, 4.33, -1, 0 +SM33, 3.79, -1, 0 +SM34, 4.20, -1, 0 +SM35, 3.45, -1, 0 +SM36, 3.71, -1, 0 +SM37, 3.57, -1, 0 +SM38, 3.83, -1, 0 +SM39, 3.76, -1, 0 +SM40, 3.66, -1, 0 +SM41, 1.59, -1, 0 +SM42, 1.56, -1, 0 +SM43, 1.90, -1, 0 +SM44, 3.12, -1, 0 +SM45, 3.22, -1, 0 +SM46, 2.93, -1, 0 +# +# +# All submissions must either be ranked or non-ranked. 
+# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1.csv b/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1.csv new file mode 100644 index 00000000..56a25800 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1.csv @@ -0,0 +1,141 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit relative free energy (RFE) predictions for all 22 molecules. Incomplete submissions will not be accepted. +# Please report RFE standard error of the mean (SEM) and RFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty +# +# If you have evaluated additional microstates, include the following: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty, SMILES string of the predicted microstate +# +# The molecule ID of the other microstate needs to be in the format: `SMXX_extra001` (number can vary) +# Also email us the `.mol2` file of your microstate with explicit hydrogens and correct bond orders, +# Please send the `.mol2` file to the email listed on the instructions page. 
+# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +#ref,refid,charge,RFE,SEM,MOD +SM25_micro000,SM25_micro001,-1,-302.80,0.00,0.00 +SM26_micro000,SM26_micro001,-1,-304.26,0.00,0.00 +SM27_micro000,SM27_micro001,-1,-303.54,0.00,0.00 +SM28_micro000,SM28_micro001,0,2.44,0.00,0.00 +SM29_micro000,SM29_micro001,-1,-303.29,0.00,0.00 +SM30_micro000,SM30_micro001,-1,-315.28,0.00,0.00 +SM31_micro000,SM31_micro001,-1,-303.80,0.00,0.00 +SM32_micro000,SM32_micro001,-1,-298.44,0.00,0.00 +SM33_micro000,SM33_micro001,-1,-301.80,0.00,0.00 +SM34_micro000,SM34_micro001,-1,-302.30,0.00,0.00 +SM35_micro000,SM35_micro001,-1,-295.74,0.00,0.00 +SM36_micro000,SM36_micro001,-1,-297.05,0.00,0.00 +SM37_micro000,SM37_micro001,1,210.31,0.00,0.00 +SM38_micro000,SM38_micro001,-1,-296.78,0.00,0.00 +SM39_micro000,SM39_micro001,-1,-296.99,0.00,0.00 +SM40_micro000,SM40_micro001,-1,-296.92,0.00,0.00 +SM41_micro000,SM41_micro001,-1,-304.67,0.00,0.00 +SM42_micro000,SM42_micro001,-1,-302.42,0.00,0.00 +SM43_micro000,SM43_micro001,-1,-303.06,0.00,0.00 +SM44_micro000,SM44_micro001,-1,-303.54,0.00,0.00 +SM45_micro000,SM45_micro001,-1,-302.66,0.00,0.00 +SM46_micro000,SM46_micro001,-1,-304.30,0.00,0.00 +SM25_micro000,SM25_micro002,0,-2.33,0.00,0.00 +SM26_micro000,SM26_micro002,0,0.45,0.00,0.00 +SM28_micro000,SM28_micro002,-1,-297.34,0.00,0.00 +SM35_micro000,SM35_micro002,0,-0.13,0.00,0.00 +SM36_micro000,SM36_micro002,0,0.00,0.00,0.00 +SM37_micro000,SM37_micro002,-1,-299.50,0.00,0.00 +SM40_micro000,SM40_micro002,1,210.08,0.00,0.00 +SM41_micro000,SM41_micro002,1,217.75,0.00,0.00 +SM42_micro000,SM42_micro002,1,219.72,0.00,0.00 +SM43_micro000,SM43_micro002,1,218.66,0.00,0.00 +SM44_micro000,SM44_micro002,1,218.37,0.00,0.00 +SM45_micro000,SM45_micro002,1,219.23,0.00,0.00 +SM46_micro000,SM46_micro002,1,215.55,0.00,0.00 +SM25_micro000,SM25_micro003,-1,-302.63,0.00,0.00 +SM26_micro000,SM26_micro003,-1,-306.95,0.00,0.00 
+SM28_micro000,SM28_micro003,1,206.83,0.00,0.00 +SM35_micro000,SM35_micro003,-1,-298.00,0.00,0.00 +SM36_micro000,SM36_micro003,-1,-297.05,0.00,0.00 +SM37_micro000,SM37_micro003,0,-2.58,0.00,0.00 +SM43_micro000,SM43_micro003,2,359.33,0.00,0.00 +SM46_micro000,SM46_micro003,2,356.00,0.00,0.00 +SM37_micro000,SM37_micro004,-1,-299.50,0.00,0.00 + + +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Michael Jones + +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +NIH NHLBI + +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +RFE-NHLBI-TZVP-QM + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gaussian 09/16 +OpenBabel +MOE 2018 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + + +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +30 hours, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. 
+# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All calculations were performed on a Biowulf and Lobos clusters at the National Institutes of Health. + + + + +# METHOD DESCRIPTION SECTION +Method: +Def2-TZVP basis sets were used for all calculations. +All calculations were performed in either Gaussian 09 or Gaussian 16. All challenge molecules were converted to 3d coordinates using OpenBabel from the SMILES string. Structures were then optimized with the B3LYP density functional and were verified to be local minima via frequency calculations on an ‘ultrafine’ integration grid with harmonic frequencies. Relative Free Energies (RFE) were determined by using the thermodynamic cycle to calculate the "reaction free energies" (i.e., HA-> A- + H+, including the proton free energy of solvation) in both the gas and aqueous phases. Following, the RFE was determined by determining the difference between the enumerated microstate's reaction and the reference microstate (i.e., ∆Gsolv(SMXX_micro001)-∆Gsolv(SMXX_micro000)). All RFE's are reported in kcal/more. + + +# +Ranked: +True diff --git a/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1_L0OUNi2.csv b/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1_L0OUNi2.csv new file mode 100644 index 00000000..56a25800 --- /dev/null +++ b/physical_property/pKa/Analysis/Submissions/pka-nhlbi-1_L0OUNi2.csv @@ -0,0 +1,141 @@ +# RELATIVE FREE ENERGY PREDICTIONS (for pKa prediction) +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. 
+# +# PREDICTION SECTION +# +# It is mandatory to submit relative free energy (RFE) predictions for all 22 molecules. Incomplete submissions will not be accepted. +# Please report RFE standard error of the mean (SEM) and RFE model uncertainty. +# +# The data in each prediction line should be structured as follows: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty +# +# If you have evaluated additional microstates, include the following: +# Microstate ID of reference state, Microstate ID of the predicted microstate, total charge, RFE, RFE SEM, RFE model uncertainty, SMILES string of the predicted microstate +# +# The molecule ID of the other microstate needs to be in the format: `SMXX_extra001` (number can vary) +# Also email us the `.mol2` file of your microstate with explicit hydrogens and correct bond orders, +# Please send the `.mol2` file to the email listed on the instructions page. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +#ref,refid,charge,RFE,SEM,MOD +SM25_micro000,SM25_micro001,-1,-302.80,0.00,0.00 +SM26_micro000,SM26_micro001,-1,-304.26,0.00,0.00 +SM27_micro000,SM27_micro001,-1,-303.54,0.00,0.00 +SM28_micro000,SM28_micro001,0,2.44,0.00,0.00 +SM29_micro000,SM29_micro001,-1,-303.29,0.00,0.00 +SM30_micro000,SM30_micro001,-1,-315.28,0.00,0.00 +SM31_micro000,SM31_micro001,-1,-303.80,0.00,0.00 +SM32_micro000,SM32_micro001,-1,-298.44,0.00,0.00 +SM33_micro000,SM33_micro001,-1,-301.80,0.00,0.00 +SM34_micro000,SM34_micro001,-1,-302.30,0.00,0.00 +SM35_micro000,SM35_micro001,-1,-295.74,0.00,0.00 +SM36_micro000,SM36_micro001,-1,-297.05,0.00,0.00 +SM37_micro000,SM37_micro001,1,210.31,0.00,0.00 +SM38_micro000,SM38_micro001,-1,-296.78,0.00,0.00 +SM39_micro000,SM39_micro001,-1,-296.99,0.00,0.00 +SM40_micro000,SM40_micro001,-1,-296.92,0.00,0.00 +SM41_micro000,SM41_micro001,-1,-304.67,0.00,0.00 +SM42_micro000,SM42_micro001,-1,-302.42,0.00,0.00 +SM43_micro000,SM43_micro001,-1,-303.06,0.00,0.00 +SM44_micro000,SM44_micro001,-1,-303.54,0.00,0.00 +SM45_micro000,SM45_micro001,-1,-302.66,0.00,0.00 +SM46_micro000,SM46_micro001,-1,-304.30,0.00,0.00 +SM25_micro000,SM25_micro002,0,-2.33,0.00,0.00 +SM26_micro000,SM26_micro002,0,0.45,0.00,0.00 +SM28_micro000,SM28_micro002,-1,-297.34,0.00,0.00 +SM35_micro000,SM35_micro002,0,-0.13,0.00,0.00 +SM36_micro000,SM36_micro002,0,0.00,0.00,0.00 +SM37_micro000,SM37_micro002,-1,-299.50,0.00,0.00 +SM40_micro000,SM40_micro002,1,210.08,0.00,0.00 +SM41_micro000,SM41_micro002,1,217.75,0.00,0.00 +SM42_micro000,SM42_micro002,1,219.72,0.00,0.00 +SM43_micro000,SM43_micro002,1,218.66,0.00,0.00 +SM44_micro000,SM44_micro002,1,218.37,0.00,0.00 +SM45_micro000,SM45_micro002,1,219.23,0.00,0.00 +SM46_micro000,SM46_micro002,1,215.55,0.00,0.00 +SM25_micro000,SM25_micro003,-1,-302.63,0.00,0.00 +SM26_micro000,SM26_micro003,-1,-306.95,0.00,0.00 +SM28_micro000,SM28_micro003,1,206.83,0.00,0.00 +SM35_micro000,SM35_micro003,-1,-298.00,0.00,0.00 
+SM36_micro000,SM36_micro003,-1,-297.05,0.00,0.00 +SM37_micro000,SM37_micro003,0,-2.58,0.00,0.00 +SM43_micro000,SM43_micro003,2,359.33,0.00,0.00 +SM46_micro000,SM46_micro003,2,356.00,0.00,0.00 +SM37_micro000,SM37_micro004,-1,-299.50,0.00,0.00 + + +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Michael Jones + +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +NIH NHLBI + +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +RFE-NHLBI-TZVP-QM + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +Gaussian 09/16 +OpenBabel +MOE 2018 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Physical (QM) + + +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +30 hours, CPU + + +# +# COMPUTING AND HARDWARE SECTION +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. 
+# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +All calculations were performed on a Biowulf and Lobos clusters at the National Institutes of Health. + + + + +# METHOD DESCRIPTION SECTION +Method: +Def2-TZVP basis sets were used for all calculations. +All calculations were performed in either Gaussian 09 or Gaussian 16. All challenge molecules were converted to 3d coordinates using OpenBabel from the SMILES string. Structures were then optimized with the B3LYP density functional and were verified to be local minima via frequency calculations on an ‘ultrafine’ integration grid with harmonic frequencies. Relative Free Energies (RFE) were determined by using the thermodynamic cycle to calculate the "reaction free energies" (i.e., HA-> A- + H+, including the proton free energy of solvation) in both the gas and aqueous phases. Following, the RFE was determined by determining the difference between the enumerated microstate's reaction and the reference microstate (i.e., ∆Gsolv(SMXX_micro001)-∆Gsolv(SMXX_micro000)). All RFE's are reported in kcal/more. 
+ + +# +Ranked: +True diff --git a/physical_property/permeability/Analysis/SAMPL7-user-map-HG.csv b/physical_property/permeability/Analysis/SAMPL7-user-map-HG.csv new file mode 100644 index 00000000..f3846b35 --- /dev/null +++ b/physical_property/permeability/Analysis/SAMPL7-user-map-HG.csv @@ -0,0 +1,2 @@ +22,permeability-dddc-1.csv +24,permeability-dddc-2.csv diff --git a/physical_property/permeability/Analysis/Scripts/get_usermap.py b/physical_property/permeability/Analysis/Scripts/get_usermap.py new file mode 100644 index 00000000..53aa2a64 --- /dev/null +++ b/physical_property/permeability/Analysis/Scripts/get_usermap.py @@ -0,0 +1,16 @@ +#!/bin/env python + +outfile = '../SAMPL7-user-map-HG.csv' + +# Read user map from submission server +file = open('/Users/dmobley/github/SAMPL-submission-systems/SAMPL-submission-handling-shared/submissions/downloads/submission_table.txt', 'r') +text = file.readlines() +file.close() + +# Write output file, removing e-mail addresses +file = open(outfile, 'w') +for line in text: + tmp = line.split(',') + if 'PERMEABILITY' in tmp[2].upper(): + file.write(f'{tmp[0].strip()},{tmp[2].strip().replace(" ","_")}\n') +file.close() diff --git a/physical_property/permeability/Analysis/Submissions/permeability-dddc-1.csv b/physical_property/permeability/Analysis/Submissions/permeability-dddc-1.csv new file mode 100644 index 00000000..a2605a85 --- /dev/null +++ b/physical_property/permeability/Analysis/Submissions/permeability-dddc-1.csv @@ -0,0 +1,175 @@ +# PERMEABILITY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit permeability predictions for all 22 molecules. 
+# Incomplete submissions will not be accepted. +# +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section- if you used +# the the challenge provided molecule (as found in the `SAMPL7_molecule_ID_and_SMILES.csv` file in the permeability repo) use the form `SMXX`, +# if you used a microstate that was provided in the pKa challenge please use that name here (e.g. SM26_micro000, SM26_micro001, etc), and +# if you used a microstate not provided in the sampl challenge you must use the form `SMXX_extraXXX` (where XXX can be any number). +# +# Please report apparent permeability coefficient (logPapp) standard error of the mean (SEM) and logPapp model uncertainty. +# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), logPapp, logPapp SEM, logPapp model uncertainty +# +# Your logPapp predictions do NOT have to use the challenge provided molecules in the `SAMPL7_molecule_ID_and_SMILES.csv` file. +# If you use a microstate other than the one listed in the challenge provided files, please fill out the `Molecule ID/IDs considered (no commas)` +# section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods +# description in the `METHOD DESCRIPTION SECTION`. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. 
+# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. +Predictions: +SM25,SM25,-5.70,0.15,0.64 +SM26,SM26,-6.12,0.31,0.64 +SM27,SM27,-5.72,0.49,0.64 +SM28,SM28,-5.32,0.19,0.64 +SM29,SM29,-5.55,0.43,0.64 +SM30,SM30,-5.38,0.25,0.64 +SM31,SM31,-5.77,0.75,0.64 +SM32,SM32,-5.50,0.48,0.64 +SM33,SM33,-5.43,0.28,0.64 +SM34,SM34,-5.65,0.77,0.64 +SM35,SM35,-5.76,0.70,0.64 +SM36,SM36,-5.65,0.32,0.64 +SM37,SM37,-5.78,0.67,0.64 +SM38,SM38,-6.32,0.68,0.64 +SM39,SM39,-6.21,0.47,0.64 +SM40,SM40,-6.14,0.66,0.64 +SM41,SM41,-5.56,0.02,0.64 +SM42,SM42,-5.11,0.32,0.64 +SM43,SM43,-5.42,0.16,0.64 +SM44,SM44,-6.54,0.05,0.64 +SM45,SM45,-5.88,0.22,0.64 +SM46,SM46,-6.16,0.10,0.64 + +# +# +# Please list your name, using only UTF-8 characters as described above. The "Participant name:" entry is required. +Participant name: +Xiaoyu Ding, Xutong Li + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Shanghai Institute of Materia Medica +University of Chinese Academy of Sciences + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The character limit for the name should not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +permeability-prediction-Attentive FP + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. 
+Compute time: +0.01 hours, CPU +0.01 hours, GPU + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. +Computing and hardware: +Attentive FP was trained and performed on one NVIDIA Tesla V100 on a single machine +hosting an Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +pytorch 1.4.0 +torch-geometric 1.3.2 +theano 1.0.4 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates (that are not found in the separate pKa challenge), please report their SMILES strings +# and populations in this section. +# If you used a microstate other than the challenge provided microstate found in the `SAMPL7_molecule_ID_and_SMILES.csv` file, please +# list your chosen molecule ID (in the form of `SMXX_extraXXX`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. 
+Method: +Data Curation and Preparation +Data were collected from literature and databases. More detail are not shown. +The compound structures were cleaned and normalized by our in-house script. +If a compound had multiple permeabilities, the arithmetic means of all permeabilities as the final annotation, +unless the reported logPapp values covered a range of more than 0.5. +The final dataset consists of 2942 compounds. + +Model building +All compounds were randomly dividied into training and test set by a ratio of 4:1. +Five-fold cross-validation was performed within the training dataset during the training process. +In detail, training dataset was split into five folds +and each fold is then used onceas a validation, whereas the four remaining folds form the training set. +For each fold, early stopping was applied and the training process. +We set a maximum epoch of 500, and if the performance root mean squared error (RMSE) had not improved in 8 epochs on the training set and in 18 epochs on the validation set, +the training process was terminated early. +Attentive FP[1] that have previously reported by our group was applied to build prediction model. +The six hyper-parameters were set as following: k (the number of attentive layers for atom embedding) = 4, +t (the number of attentive layers for molecule embedding) = 4, +fingerprint dimension = 128, L2 weight decay = 10^-5, learning rate = 10^-5, and dropout rate = 0.4. +We performed three independent runs with different random seeds to train the model and obtain the standard error of the mean (SEM) as a measure of statistical uncertainty. +Model uncertainty is calculated as the RMSE(0.64) between predicted and experiment values for test set. + +[1] Xiong Z, Wang D, Liu X, et al. Pushing the Boundaries of Molecular Representation for Drug Discovery with the Graph Attention Mechanism[J]. Journal of Medicinal Chemistry, 2019. + +# +# +# All submissions must either be ranked or non-ranked. 
+# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. +# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True diff --git a/physical_property/permeability/Analysis/Submissions/permeability-dddc-2.csv b/physical_property/permeability/Analysis/Submissions/permeability-dddc-2.csv new file mode 100644 index 00000000..9d86e5a1 --- /dev/null +++ b/physical_property/permeability/Analysis/Submissions/permeability-dddc-2.csv @@ -0,0 +1,175 @@ +# PERMEABILITY PREDICTIONS +# +# This file will be automatically parsed. It must contain the following four elements: +# predictions, name of method, software listing, and method description. +# These elements must be provided in the order shown with their respective headers. +# +# Any line that begins with a # is considered a comment and will be ignored when parsing. +# +# PREDICTION SECTION +# +# It is mandatory to submit permeability predictions for all 22 molecules. +# Incomplete submissions will not be accepted. +# +# Please report the general molecule `ID tag` in the form of `SMXX` (e.g. SM25, SM26, etc). +# Please indicate the microstate(s) used in the `Molecule ID/IDs considered (no commas)` section- if you used +# the the challenge provided molecule (as found in the `SAMPL7_molecule_ID_and_SMILES.csv` file in the permeability repo) use the form `SMXX`, +# if you used a microstate that was provided in the pKa challenge please use that name here (e.g. SM26_micro000, SM26_micro001, etc), and +# if you used a microstate not provided in the sampl challenge you must use the form `SMXX_extraXXX` (where XXX can be any number). +# +# Please report apparent permeability coefficient (logPapp) standard error of the mean (SEM) and logPapp model uncertainty. 
+# +# The data in each prediction line should be structured as follows: +# ID tag, Molecule ID/IDs considered (no commas), logPapp, logPapp SEM, logPapp model uncertainty +# +# Your logPapp predictions do NOT have to use the challenge provided molecules in the `SAMPL7_molecule_ID_and_SMILES.csv` file. +# If you use a microstate other than the one listed in the challenge provided files, please fill out the `Molecule ID/IDs considered (no commas)` +# section using a molecule ID in the form of `SMXX_extra001` (number can vary) and please list the molecule ID and it's SMILES string in your methods +# description in the `METHOD DESCRIPTION SECTION`. +# +# Only one entry in the second column (`Molecule ID/IDs considered (no commas)`) is required, but you should list all IDs considered/input to your calculations. See challenge instructions. +# +# If you have evaluated additional microstates then the molecule ID used in the `Molecule ID/IDs considered (no commas)` section needs to be in the format: `SMXX_extra001` (number can vary). +# If multiple microstates are used, please report the order of population in the aqueous phase in descending order. +# Please list microstate populations, SMILES strings and the molecule IDs in the `METHOD DESCRIPTION SECTION` section further below. +# +# The list of predictions must begin with the 'Predictions:' keyword as illustrated here. 
+Predictions: +SM25,SM25,-5.38,1.02,0.62 +SM26,SM26,-5.32,1.09,0.62 +SM27,SM27,-5.25,1.12,0.62 +SM28,SM28,-5.24,1.15,0.62 +SM29,SM29,-5.23,1.14,0.62 +SM30,SM30,-5.22,1.12,0.62 +SM31,SM31,-5.25,1.17,0.62 +SM32,SM32,-5.19,1.14,0.62 +SM33,SM33,-5.16,1.12,0.62 +SM34,SM34,-5.25,1.13,0.62 +SM35,SM35,-5.36,1.11,0.62 +SM36,SM36,-5.39,1.06,0.62 +SM37,SM37,-5.42,1.11,0.62 +SM38,SM38,-5.40,1.08,0.62 +SM39,SM39,-5.37,1.07,0.62 +SM40,SM40,-5.42,1.11,0.62 +SM41,SM41,-5.35,1.07,0.62 +SM42,SM42,-5.39,1.00,0.62 +SM43,SM43,-5.30,1.16,0.62 +SM44,SM44,-5.34,1.09,0.62 +SM45,SM45,-5.39,1.01,0.62 +SM46,SM46,-5.30,1.17,0.62 + +# +# +# Please list your name," using only UTF-8 characters as described above. The ""Participant name:"" entry is required." +Participant name: +Xiaoyu Ding, Xutong Li + +# +# +# Please list your organization/affiliation, using only UTF-8 characters as described above. +Participant organization: +Shanghai Institute of Materia Medica +University of Chinese Academy of Sciences + +# +# +# NAME SECTION +# +# Please provide an informal but informative name of the method used. +# The name must not exceed 40 characters. +# The 'Name:' keyword is required as shown here. +Name: +GROVER + +# +# +# COMPUTE TIME SECTION +# +# Please provide the average compute time across all of the molecules. +# For physical methods, report the GPU and/or CPU compute time in hours. +# For empirical methods, report the query time in hours. +# Create a new line for each processor type. +# The 'Compute time:' keyword is required as shown here. +Compute time: +0.01 hours + +# +# COMPUTING AND HARDWARE SECTION +# +# Please provide details of the computing resources that were used to train models and make predictions. +# Please specify compute time for training models and querying separately for empirical prediction methods. +# Provide a detailed description of the hardware used to run the simulations. +# The 'Computing and hardware:' keyword is required as shown here. 
+Computing and hardware: +GROVER was trained and performed on one NVIDIA P40 on a single machine +hosting an Intel(R) Xeon(R) Gold 6136 CPU @ 3.00GHz. + + +# SOFTWARE SECTION +# +# List all major software packages used and their versions. +# Create a new line for each software. +# The 'Software:' keyword is required. +Software: +pytorch1.1 +python3.6 +rdkit2019.03.3.0 + +# METHOD CATEGORY SECTION +# +# State which method category your prediction method is better described as: +# `Physical (MM)`, `Physical (QM)`, `Empirical`, or `Mixed`. +# Pick only one category label. +# The `Category:` keyword is required. +Category: +Empirical + +# METHOD DESCRIPTION SECTION +# +# Methodology and computational details. +# Level of details should be roughly equivalent to that used in a publication. +# Please include the values of key parameters with units. +# Please explain how statistical uncertainties were estimated. +# +# If you have evaluated additional microstates, please report their SMILES strings and populations of all the microstates in this section. +# If you used a microstate other than the challenge provided microstate (`SMXX_micro000`), please list your chosen `Molecule ID` (in the form of `SMXX_extra001`) along with the SMILES string in your methods description. +# +# Use as many lines of text as you need. +# All text following the 'Method:' keyword will be regarded as part of your free text methods description. +Method: +Data Curation and Preparation +Data were collected from literature and databases. More detail are not shown. +The compound structures were cleaned and normalized by our in-house script. +If a compound had multiple logP, the arithmetic means of all values as the final annotation, +unless the reported logP values covered a range of more than 0.5. +Since the number of records is too large, random samping was performed on the full datset. +About half of the compounds were randomly selected for model buiding within the limited time. 
+The final dataset consists of 2942 compounds. + +Model building +All compounds were randomly dividied into training and test set by a ratio of 4:1. +Five-fold cross-validation was performed within the training dataset during the training process,5 models were build for each fold with different initial parameters for ensemble. +In detail, training dataset was split into five folds +and each fold is then used once as a validation, whereas the four remaining folds form the training set. +Five models were build for each fold with different initial parameters for ensemble. +For each fold, early stopping was applied and the training process. +We set a maximum epoch of 80, and if the performance root mean squared error (RMSE) had not improved in 10 epochs on the validation set, +the training process was terminated early. +GROVER[1] that have previously reported by Rong et al. was applied to build prediction model. +The six hyper-parameters were set as following: +batch_size = 32, +attn_hidden = 16, L2 weight decay = 10^-7, learning rate = 10^-5, and dropout rate = 0. +Finally, the predictions of logP were converted into transfer free energy (TFE) through multiplying by -1.364. +We performed five independent runs with different random seeds to train the model and obtain the standard error of the mean (SEM) as a measure of statistical uncertainty. +Model uncertainty is calculated as the RMSE between predicted and experiment values for test set. +[1] Rong Y, Bian Y, Xu T, et al. GROVER: Self-supervised Message Passing Transformer on Large-scale Molecular Data[J]. arXiv preprint arXiv:2007.02835, 2020. + +# +# +# All submissions must either be ranked or non-ranked. +# Only one ranked submission per participant is allowed. +# Multiple ranked submissions from the same participant will not be judged. +# Non-ranked submissions are accepted so we can verify that they were made before the deadline. 
+# The "Ranked:" keyword is required, and expects a Boolean value (True/False) +Ranked: +True From 7396b811f1ca8ba6750f4adb25e57a9f7cc5354c Mon Sep 17 00:00:00 2001 From: David Mobley Date: Sat, 10 Oct 2020 17:07:27 -0700 Subject: [PATCH 2/3] update changelogs/manifests --- README.md | 1 + physical_property/logP/README.md | 1 + physical_property/pKa/README.md | 1 + physical_property/permeability/README.md | 1 + 4 files changed, 4 insertions(+) diff --git a/README.md b/README.md index c6867d0a..b0d1607e 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ The SAMPL7 physical property challenge is now open! All three host-guest challen - **Added additional microstates for pKa challenge**, from Bogdan Iorga (Sept. 30, 2020). Updated instructions to clarify that any states not included in pKa predictions will be assumed to be unpopulated (so participants can omit these states). Updated pKa instructions/template to allow optional submission of macro pKa values. - **Note that experiments used specified chirality for certain physical property compounds**, `SM35`, `SM36` and `SM37`. So only the structures with specified chirality for these compounds should be used. - **Add SAMPL7 physical properties experimental values** (Oct. 10, 2020). +- Add SAMPL7 physical properties submissions (Oct. 10, 2020) ## Challenge overview diff --git a/physical_property/logP/README.md b/physical_property/logP/README.md index 43b2d71d..1a514423 100644 --- a/physical_property/logP/README.md +++ b/physical_property/logP/README.md @@ -17,3 +17,4 @@ Experimental log *P* measurements will made available after the challenge deadli - [`submission_template/logP_prediction_template.csv`](submission_template/logP_prediction_template.csv) - An empty prediction submission template file. - [`example_submission_file/logP-DanielleBergazinExampleFile-1.csv`](example_submission_file/logP-DanielleBergazinExampleFile-1.csv) - An example submission file filled with random values to illustrate expected format. 
- [`logP_challenge_instructions.md`](logP_challenge_instructions.md) - Instructions for the log *P* challenge. +- `Analysis`: Contains submissions, and will eventually contain results of analysis. diff --git a/physical_property/pKa/README.md b/physical_property/pKa/README.md index 4f5e502b..ebbc99d5 100644 --- a/physical_property/pKa/README.md +++ b/physical_property/pKa/README.md @@ -16,3 +16,4 @@ Experimental pKa measurements will made available after the challenge - [`example_submission_file/pKa-DanielleBergazinExampleFile-1.csv`](example_submission_file/pKa-DanielleBergazinExampleFile-1.csv) - An example submission file filled with random values to illustrate expected format. - [`pKa_challenge_instructions.md`](pKa_challenge_instructions.md) - Instructions for the pKa challenge. - [`transition_networks/`](transition_networks/) - This directory contains transition networks of the challenge molecules in `.PDF` and `.PPTX` format. +- `Analysis`: Contains submissions, and will eventually contain results of analysis. diff --git a/physical_property/permeability/README.md b/physical_property/permeability/README.md index b6179bad..5ab3b842 100644 --- a/physical_property/permeability/README.md +++ b/physical_property/permeability/README.md @@ -17,3 +17,4 @@ Experimental permeability data will made available after the challenge deadline. - [`submission_template/permeability_prediction_template.csv`](submission_template/permeability_prediction_template.csv) - An empty prediction submission template file. - [`example_submission_file/permeability-DanielleBergazinExampleFile-1.csv`](example_submission_file/permeability-DanielleBergazinExampleFile-1.csv) - An example submission file filled with random values to illustrate expected format. - [`permeability_challenge_instructions.md`](permeability_challenge_instructions.md) - Instructions for permeability challenge. +- `Analysis`: Contains submissions, and will eventually contain results of analysis. 
From fb86aa9fb39fbfe9517eab589e218c564544fabe Mon Sep 17 00:00:00 2001 From: David Mobley Date: Sat, 10 Oct 2020 17:07:37 -0700 Subject: [PATCH 3/3] Update manifest --- physical_property/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/physical_property/README.md b/physical_property/README.md index 9410fadd..b15a1f93 100644 --- a/physical_property/README.md +++ b/physical_property/README.md @@ -40,9 +40,9 @@ Effective permeability (log*P**app*) was measured by PAMPA ## What's here - [`SAMPL7_molecule_ID_and_SMILES.csv`](SAMPL7_molecule_ID_and_SMILES.csv): A `.CSV` file containing SAMPL7 challenge molecule IDs and SMILES. SMILES were provided by the [Ballatore lab](https://pharmacy.ucsd.edu/faculty/ballatore). -- [`logP/`](logP/): Folder contains an input file in `.CSV` format with SMILES strings of the neutral states of the molecules. This folder contains instructions and a submission template for the logP challenge. -- [`pKa/`](pKa/): Folder contains challenge input files in `.CSV` format with SMILES of enumerated microstates. `.MOL2` and `.SDF` files of each microstate are also provided. This folder contains instructions and a submission template for the pKa challenge. Microstates (tautomers and protomers) were generated with a notebook wich uses RDKit and OpenEye tools. Additional microstates were enumerated using Chemicalize (Chemaxon) and Epik (Schrodinger) and added to the notebook generated `.CSV` files. -- [`permeability/`](permeability/): Folder contains input files in `.CSV` format with SMILES strings of molecules. This folder contains instructions and a submission template for the permeability challenge. +- [`logP/`](logP/): Folder contains an input file in `.CSV` format with SMILES strings of the neutral states of the molecules. This folder contains instructions and a submission template for the logP challenge. Also contains submission files for submitted predictions. 
+- [`pKa/`](pKa/): Folder contains challenge input files in `.CSV` format with SMILES of enumerated microstates. `.MOL2` and `.SDF` files of each microstate are also provided. This folder contains instructions and a submission template for the pKa challenge. Microstates (tautomers and protomers) were generated with a notebook which uses RDKit and OpenEye tools. Additional microstates were enumerated using Chemicalize (Chemaxon) and Epik (Schrodinger) and added to the notebook-generated `.CSV` files. Also contains submission files for submitted predictions. +- [`permeability/`](permeability/): Folder contains input files in `.CSV` format with SMILES strings of molecules. This folder contains instructions and a submission template for the permeability challenge. Also contains submission files for submitted predictions. - [`images/`](images): Folder containing images related to this challenge in PDF and/or JPEG format. - [`experimental_data/`](experimental_data/): Folder will contain experimental measurements of pKa, partitioning, and permeability values after the SAMPL7 challenge submission deadline.