From 9e5ab3922f53ae41e5e923d2156f4faef69c7e31 Mon Sep 17 00:00:00 2001 From: Adrian Mirza Date: Mon, 16 Oct 2023 12:22:20 +0200 Subject: [PATCH] Add train-test split for tabular data (#418) * feat: add scaffold split * feat: add file rewriting and merge on common smiles * feat: removed the logic for the other representations * fix: reference * fix: typo * feat: fix minor details and add docstring * fix: docstrings * feat: add simple test script * a bit of polish for the train/test split * add docstring * only work on files with SMILES col * change rename behavior * remove lint * lint * do not track dev notebook * pin pydantic yaml --------- Co-authored-by: Kevin Maik Jablonka --- .pre-commit-config.yaml | 2 +- .../mattermodeling_stackexchange/transform.py | 4 +- .../develop_transform.ipynb | 0 data/tabular/peptides_hemolytic/meta.yaml | 285 +++++++++--------- .../physics_stackexchange/transform.py | 4 +- data/tabular/train_test_split.py | 234 ++++++++++++++ pyproject.toml | 2 +- 7 files changed, 377 insertions(+), 154 deletions(-) delete mode 100644 data/tabular/orbnet_denali_training/develop_transform.ipynb create mode 100644 data/tabular/train_test_split.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index f2a1169a9..63e6b64e0 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -6,7 +6,7 @@ ci: repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.4.0 + rev: v4.5.0 hooks: - id: check-json - id: check-yaml diff --git a/data/tabular/mattermodeling_stackexchange/transform.py b/data/tabular/mattermodeling_stackexchange/transform.py index 792da324b..1f665833b 100644 --- a/data/tabular/mattermodeling_stackexchange/transform.py +++ b/data/tabular/mattermodeling_stackexchange/transform.py @@ -1,5 +1,5 @@ -from datasets import load_dataset import pandas as pd +from datasets import load_dataset def remove_repeated_almost_empty_lines(text): @@ -31,7 +31,7 @@ def get_clean_df(): # the answers are in an array of arrays, the first element is the answer, the second is the score # we then also only keep two columns, the question and the answer, both as string on which we also # call the strip function to remove leading and trailing whitespaces - for i, row in df.iterrows(): + for _i, row in df.iterrows(): # skip question with markdown image tag in it if "![" in row["question_text"]: continue diff --git a/data/tabular/orbnet_denali_training/develop_transform.ipynb b/data/tabular/orbnet_denali_training/develop_transform.ipynb deleted file mode 100644 index e69de29bb..000000000 diff --git a/data/tabular/peptides_hemolytic/meta.yaml b/data/tabular/peptides_hemolytic/meta.yaml index 8df0e0099..1a882fdda 100644 --- a/data/tabular/peptides_hemolytic/meta.yaml +++ b/data/tabular/peptides_hemolytic/meta.yaml @@ -1,157 +1,146 @@ +--- name: peptides_hemolytic -description: "Hemolysis is referred to the disruption of erythrocyte\nmembranes that\ - \ decrease the life span of red blood cells and causes\nthe release of Hemoglobin.\ - \ It is critical to identify non-hemolytic\nantimicrobial peptides as a non-toxic\ - \ and safe measure against bacterial\ninfections. However, distinguishing between\ - \ hemolytic and non-hemolytic\npeptides is a challenge, since they primarily exert\ - \ their activity at the\ncharged surface of the bacterial plasma membrane.\nThe\ - \ data here comes from the Database of Antimicrobial Activity and Structure of\n\ - Peptides (DBAASP v3). Hemolytic activity is defined by extrapolating a measurement\n\ - assuming dose response curves to the point\nat which 50% of red blood cells are\ - \ lysed. Activities below 100 mu g/ml, are\nconsidered hemolytic.\nThe data contains\ - \ sequences of only L- and canonical amino acids. Each measurement\nis treated independently,\ - \ so sequences can appear multiple times. This experimental\ndataset contains noise,\ - \ and in some observations (40%), an identical sequence appears\nin both negative\ - \ and positive class. As an example, sequence \"RVKRVWPLVIRTVIAGYNLYRAIKKK\"\nis\ - \ found to be both hemolytic and\nnon-hemolytic in two different lab experiments\ - \ (i.e. two different training examples). " +description: "Hemolysis is referred to the disruption of erythrocyte\nmembranes that decrease the life span of red blood cells and causes\nthe release of\ + \ Hemoglobin. It is critical to identify non-hemolytic\nantimicrobial peptides as a non-toxic and safe measure against bacterial\ninfections. However,\ + \ distinguishing between hemolytic and non-hemolytic\npeptides is a challenge, since they primarily exert their activity at the\ncharged surface of\ + \ the bacterial plasma membrane.\nThe data here comes from the Database of Antimicrobial Activity and Structure of\nPeptides (DBAASP v3). Hemolytic\ + \ activity is defined by extrapolating a measurement\nassuming dose response curves to the point\nat which 50% of red blood cells are lysed. Activities\ + \ below 100 mu g/ml, are\nconsidered hemolytic.\nThe data contains sequences of only L- and canonical amino acids. Each measurement\nis treated independently,\ + \ so sequences can appear multiple times. This experimental\ndataset contains noise, and in some observations (40%), an identical sequence appears\n\ + in both negative and positive class. As an example, sequence \"RVKRVWPLVIRTVIAGYNLYRAIKKK\"\nis found to be both hemolytic and\nnon-hemolytic in two\ + \ different lab experiments (i.e. two different training examples). " targets: -- id: hemolytic - description: The ability of a peptide sequence to lyse red blood cells (1) or not - (0). - units: null - type: boolean - names: - - noun: hemolytic activity - - noun: hemolysis - - verb: lyse red blood cells - - adjective: hemolytic - - gerund: lysing red blood cells - uris: null + - id: hemolytic + description: The ability of a peptide sequence to lyse red blood cells (1) or not (0). + units: + type: boolean + names: + - noun: hemolytic activity + - noun: hemolysis + - verb: lyse red blood cells + - adjective: hemolytic + - gerund: lysing red blood cells + uris: benchmarks: [] identifiers: -- id: sequence - type: Other - description: amino acid sequence + - id: sequence + type: Other + description: amino acid sequence license: CC BY 4.0 links: -- url: https://doi.org/10.1021/acs.jcim.2c01317 - description: corresponding publication -- url: https://doi.org/10.1093/nar/gkaa991 - description: data source + - url: https://doi.org/10.1021/acs.jcim.2c01317 + description: corresponding publication + - url: https://doi.org/10.1093/nar/gkaa991 + description: data source num_points: 6541 bibtex: -- |- - @article{Martins2012, - doi = {10.1021/ci300124c}, - url = {https://doi.org/10.1021/ci300124c}, - year = {2012}, - month = jun, - publisher = {American Chemical Society (ACS)}, - volume = {52}, - number = {6}, - pages = {1686--1697}, - author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro - and Andre O. Falcao}, - title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling}, - journal = {Journal of Chemical Information and Modeling} -- |- - @article{Wu2018, - doi = {10.1039/c7sc02664a}, - url = {https://doi.org/10.1039/c7sc02664a}, - year = {2018}, - publisher = {Royal Society of Chemistry (RSC)}, - volume = {9}, - number = {2}, - pages = {513--530}, - author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph - Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande}, - title = {MoleculeNet: a benchmark for molecular machine learning}, - journal = {Chemical Science} + - |- + @article{Martins2012, + doi = {10.1021/ci300124c}, + url = {https://doi.org/10.1021/ci300124c}, + year = {2012}, + month = jun, + publisher = {American Chemical Society (ACS)}, + volume = {52}, + number = {6}, + pages = {1686--1697}, + author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro + and Andre O. Falcao}, + title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling}, + journal = {Journal of Chemical Information and Modeling} + - |- + @article{Wu2018, + doi = {10.1039/c7sc02664a}, + url = {https://doi.org/10.1039/c7sc02664a}, + year = {2018}, + publisher = {Royal Society of Chemistry (RSC)}, + volume = {9}, + number = {2}, + pages = {513--530}, + author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph + Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande}, + title = {MoleculeNet: a benchmark for molecular machine learning}, + journal = {Chemical Science} templates: -- The sequence of {#amino acids|AAs!} {sequence#} {#shows|exhibits|demonstrates!} {hemolytic#no - &NULL}{hemolytic__names__adjective} properties. -- The amino acid sequence {sequence#} {#shows|exhibits|displays!} {hemolytic#no &NULL}{hemolytic__names__adjective} - properties. -- Based on the {#amino acid sequence |sequence of amino acids !}{sequence#}, the peptide has {hemolytic#no &NULL}{hemolytic__names__adjective} {#properties|characteristics|features!}. -- The {sequence__description} {sequence#} {#represents|is from!} a peptide - that is {hemolytic#not &NULL}identified as {hemolytic__names__adjective}. -- The {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic#not - &NULL}{hemolytic__names__adjective}. -- |- - Task: Please classify a peptide based on the description. - Description: A amino acid sequence that is {hemolytic__names__adjective}. - {#amino acid sequence |sequence of amino acids!}: {sequence#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result: {hemolytic#False&True} -- |- - Task: Please classify a amino acid sequence based on the description. - Description: A amino acid sequence that is {hemolytic__names__adjective}. - {#amino acid sequence |sequence of amino acids !}: {sequence#} - Constraint: Answer the question in a {#full|complete!} sentence. - Result: This amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}. -- |- - Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids!} based on the {#text |!}description{# below|!}. - Description: A amino acid sequence that is {hemolytic__names__adjective}. - Result: {sequence#} -- |- - User: Can you {#tell me|derive|estimate!} if the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic__names__adjective}? - Assistant: {hemolytic#No&Yes}, this amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}. -- |- - User: Is the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} {hemolytic__names__adjective}? - Assistant: {hemolytic#No&Yes}, it is {hemolytic#not &NULL}{hemolytic__names__adjective}. -- |- - User: Can you {#give me|create|generate!} the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {sequence#} -- |- - User: I'm {#searching|looking!} for the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}? - Assistant: This is a amino acid sequence that is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} -- |- - User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? - User: Yes, please. The amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}. - Assistant: {#Ok|Got it!},{# here you go,|!} this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} -- |- - User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}? - User: Yes, the amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}. - Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} -- Is the {sequence__description} {sequence#} {hemolytic__names__adjective}: {hemolytic#no&yes} -- |- - Task: Please classify a {#amino acid sequence|sequence of amino acids|peptide!} based on the description. - Description: A amino acid sequence that is {hemolytic__names__adjective}. - {#amino acid sequence|sequence of amino acids!}: {sequence#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result: {hemolytic#False&True} -- |- - Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!} based on the {#text |!}description{# below|!}. - Description: A {#amino acid sequence|sequence of amino acids|peptide!} that is {hemolytic__names__adjective}. - Result: {sequence#} -- |- - Task: Please answer the multiple choice question. - Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {hemolytic%} - Answer: {%multiple_choice_result} -- |- - Task: Please answer the multiple choice question. - Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {hemolytic%} - Answer: {%multiple_choice_result} -- |- - Task: Please answer the multiple choice question. - Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}? - Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. - Options: - {sequence%hemolytic%} - Answer: {%multiple_choice_result} -- |- - Task: Please answer the multiple choice question. - Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}? - Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. - Options: - {sequence%hemolytic%} - Answer: {%multiple_choice_result} + - The sequence of {#amino acids|AAs!} {sequence#} {#shows|exhibits|demonstrates!} {hemolytic#no &NULL}{hemolytic__names__adjective} properties. + - The amino acid sequence {sequence#} {#shows|exhibits|displays!} {hemolytic#no &NULL}{hemolytic__names__adjective} properties. + - Based on the {#amino acid sequence |sequence of amino acids !}{sequence#}, the peptide has {hemolytic#no &NULL}{hemolytic__names__adjective} {#properties|characteristics|features!}. + - The {sequence__description} {sequence#} {#represents|is from!} a peptide that is {hemolytic#not &NULL}identified as {hemolytic__names__adjective}. + - The {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic#not &NULL}{hemolytic__names__adjective}. + - |- + Task: Please classify a peptide based on the description. + Description: A amino acid sequence that is {hemolytic__names__adjective}. + {#amino acid sequence |sequence of amino acids!}: {sequence#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. + Result: {hemolytic#False&True} + - |- + Task: Please classify a amino acid sequence based on the description. + Description: A amino acid sequence that is {hemolytic__names__adjective}. + {#amino acid sequence |sequence of amino acids !}: {sequence#} + Constraint: Answer the question in a {#full|complete!} sentence. + Result: This amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}. + - |- + Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids!} based on the {#text |!}description{# below|!}. + Description: A amino acid sequence that is {hemolytic__names__adjective}. + Result: {sequence#} + - |- + User: Can you {#tell me|derive|estimate!} if the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic__names__adjective}? + Assistant: {hemolytic#No&Yes}, this amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}. + - |- + User: Is the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} {hemolytic__names__adjective}? + Assistant: {hemolytic#No&Yes}, it is {hemolytic#not &NULL}{hemolytic__names__adjective}. + - |- + User: Can you {#give me|create|generate!} the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}? + Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {sequence#} + - |- + User: I'm {#searching|looking!} for the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}? + Assistant: This is a amino acid sequence that is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} + - |- + User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? + User: Yes, please. The amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}. + Assistant: {#Ok|Got it!},{# here you go,|!} this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} + - |- + User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}. + Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}? + User: Yes, the amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}. + Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#} + - Is the {sequence__description} {sequence#} {hemolytic__names__adjective}: {hemolytic#no&yes} + - |- + Task: Please classify a {#amino acid sequence|sequence of amino acids|peptide!} based on the description. + Description: A amino acid sequence that is {hemolytic__names__adjective}. + {#amino acid sequence|sequence of amino acids!}: {sequence#} + Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. + Result: {hemolytic#False&True} + - |- + Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!} based on the {#text |!}description{# below|!}. + Description: A {#amino acid sequence|sequence of amino acids|peptide!} that is {hemolytic__names__adjective}. + Result: {sequence#} + - |- + Task: Please answer the multiple choice question. + Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {hemolytic%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}? + Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. + Options: + {hemolytic%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {sequence%hemolytic%} + Answer: {%multiple_choice_result} + - |- + Task: Please answer the multiple choice question. + Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}? + Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. + Options: + {sequence%hemolytic%} + Answer: {%multiple_choice_result} diff --git a/data/tabular/physics_stackexchange/transform.py b/data/tabular/physics_stackexchange/transform.py index e5a95243e..a392709a4 100644 --- a/data/tabular/physics_stackexchange/transform.py +++ b/data/tabular/physics_stackexchange/transform.py @@ -1,5 +1,5 @@ -from datasets import load_dataset import pandas as pd +from datasets import load_dataset def remove_repeated_almost_empty_lines(text): @@ -31,7 +31,7 @@ def get_clean_df(): # the answers are in an array of arrays, the first element is the answer, the second is the score # we then also only keep two columns, the question and the answer, both as string on which we also # call the strip function to remove leading and trailing whitespaces - for i, row in df.iterrows(): + for _i, row in df.iterrows(): # skip question with markdown image tag in it if "![" in row["question_text"]: continue diff --git a/data/tabular/train_test_split.py b/data/tabular/train_test_split.py new file mode 100644 index 000000000..ad19faba6 --- /dev/null +++ b/data/tabular/train_test_split.py @@ -0,0 +1,234 @@ +"""Perform scaffold split on all datasets and rewrite data_clean.csv files. + +Scaffold split is a method of splitting data that ensures that the same scaffold +is not present in both the train and test sets. This is important for evaluating +the generalizability of a model. + +For more information, see: + - Wu, Z.; Ramsundar, B.; Feinberg, E. N.; Gomes, J.; Geniesse, C.; Pappu, A. S.; + Leswing, K.; Pande, V. MoleculeNet: A Benchmark for Molecular Machine Learning. + Chemical Science 2018, 9 (2), 513–530. https://doi.org/10.1039/c7sc02664a. + - Jablonka, K. M.; Rosen, A. S.; Krishnapriyan, A. S.; Smit, B. + An Ecosystem for Digital Reticular Chemistry. ACS Central Science 2023, 9 (4), 563–581. + https://doi.org/10.1021/acscentsci.2c01177. + +""" +import os +import sys +from collections import defaultdict +from glob import glob +from random import Random +from typing import Dict, List + +import fire +import pandas as pd +from rdkit import Chem, RDLogger +from rdkit.Chem.Scaffolds import MurckoScaffold +from tqdm import tqdm + +RDLogger.DisableLog("rdApp.*") + + +def print_sys(s): + """system print + + Args: + s (str): the string to print + """ + print(s, flush=True, file=sys.stderr) + + +def create_scaffold_split( + df: pd.DataFrame, seed: int, frac: List[float], entity: str = "SMILES" +) -> Dict[str, pd.DataFrame]: + """create scaffold split. it first generates molecular scaffold for each molecule + and then split based on scaffolds + adapted from: https://github.com/mims-harvard/TDC/tdc/utils/split.py + + Args: + df (pd.DataFrame): dataset dataframe + fold_seed (int): the random seed + frac (list): a list of train/valid/test fractions + entity (str): the column name for where molecule stores + + Returns: + dict: a dictionary of splitted dataframes, where keys are train/valid/test + and values correspond to each dataframe + """ + random = Random(seed) + + s = df[entity].values + scaffolds = defaultdict(set) + + error_smiles = 0 + for i, smiles in tqdm(enumerate(s), total=len(s)): + try: + scaffold = MurckoScaffold.MurckoScaffoldSmiles( + mol=Chem.MolFromSmiles(smiles), includeChirality=False + ) + scaffolds[scaffold].add(i) + except Exception: + print_sys(smiles + " returns RDKit error and is thus omitted...") + error_smiles += 1 + + train, val, test = [], [], [] + train_size = int((len(df) - error_smiles) * frac[0]) + val_size = int((len(df) - error_smiles) * frac[1]) + test_size = (len(df) - error_smiles) - train_size - val_size + train_scaffold_count, val_scaffold_count, test_scaffold_count = 0, 0, 0 + + # index_sets = sorted(list(scaffolds.values()), key=lambda i: len(i), reverse=True) + index_sets = list(scaffolds.values()) + big_index_sets = [] + small_index_sets = [] + for index_set in index_sets: + if len(index_set) > val_size / 2 or len(index_set) > test_size / 2: + big_index_sets.append(index_set) + else: + small_index_sets.append(index_set) + random.seed(seed) + random.shuffle(big_index_sets) + random.shuffle(small_index_sets) + index_sets = big_index_sets + small_index_sets + + if frac[2] == 0: + for index_set in index_sets: + if len(train) + len(index_set) <= train_size: + train += index_set + train_scaffold_count += 1 + else: + val += index_set + val_scaffold_count += 1 + else: + for index_set in index_sets: + if len(train) + len(index_set) <= train_size: + train += index_set + train_scaffold_count += 1 + elif len(val) + len(index_set) <= val_size: + val += index_set + val_scaffold_count += 1 + else: + test += index_set + test_scaffold_count += 1 + + return { + "train": df.iloc[train].reset_index(drop=True), + "valid": df.iloc[val].reset_index(drop=True), + "test": df.iloc[test].reset_index(drop=True), + } + + +def rewrite_data_with_splits( + csv_paths: List[str], + train_test_df: pd.DataFrame, + override: bool = False, + check: bool = True, + repr_col: str = "SMILES", +) -> None: + """Rewrite dataframes with the correct split column + + Args: + csv_paths (List[str]): list of files to merge (data_clean.csv) + train_test_df (pd.DataFrame): dataframe containing merged SMILES representations + from all datasets uniquely split into train and test + override (bool): whether to override the existing data_clean.csv files + defaults to False + check (bool): whether to check if the split was successful + defaults to True. Can be turned off to save memory + repr_col (str): the column name for where SMILES representation is stored + defaults to "SMILES" + """ + if check: + train_smiles = set(train_test_df.query("split == 'train'")["SMILES"].to_list()) + + for path in csv_paths: + read_dataset = pd.read_csv(path) + if repr_col in read_dataset.columns: + try: + read_dataset = read_dataset.drop("split", axis=1) + message = f"Split column found in {path}." + if override: + message += " Overriding..." + print(message) + except KeyError: + print(f"No split column in {path}") + + col_to_merge = "SMILES" + merged_data = pd.merge( + read_dataset, train_test_df, on=col_to_merge, how="left" + ) + merged_data = merged_data.dropna() + if override: + merged_data.to_csv(path, index=False) + else: + # rename the old data_clean.csv file to data_clean_old.csv + os.rename(path, path.replace(".csv", "_old.csv")) + # write the new data_clean.csv file + merged_data.to_csv(path, index=False) + + if len(merged_data.query("split == 'train'")) == 0: + raise ValueError("Split failed, no train data") + if len(merged_data.query("split == 'test'")) == 0: + raise ValueError("Split failed, no test data") + if check: + test_split_smiles = set( + merged_data.query("split == 'test'")["SMILES"].to_list() + ) + if len(train_smiles.intersection(test_split_smiles)) > 0: + raise ValueError("Split failed, train and test overlap") + else: + print(f"Skipping {path} as it does not contain {repr_col} column") + + +def cli( + seed: int = 42, + train_size: float = 0.8, + val_size: float = 0.0, + test_size: float = 0.2, + path: str = "*/data_clean.csv", + override: bool = False, + check: bool = True, + repr_col: str = "SMILES", +): + paths_to_data = glob(path) + filtered_paths = [] + for path in paths_to_data: + if "flashpoint" in path: + filtered_paths.append(path) + elif "freesolv" in path: + filtered_paths.append(path) + elif "peptide" in path: + filtered_paths.append(path) + paths_to_data = filtered_paths + + REPRESENTATION_LIST = [] + + for path in tqdm(paths_to_data): + df = pd.read_csv(path) + if repr_col in df.columns: + REPRESENTATION_LIST.extend(df[repr_col].to_list()) + + REPR_DF = pd.DataFrame() + REPR_DF["SMILES"] = list(set(REPRESENTATION_LIST)) + + scaffold_split = create_scaffold_split( + REPR_DF, seed=seed, frac=[train_size, val_size, test_size] + ) + + # create train and test dataframes + train_df = scaffold_split["train"] + test_df = scaffold_split["test"] + # add split columns to train and test dataframes + train_df["split"] = len(train_df) * ["train"] + test_df["split"] = len(test_df) * ["test"] + + # merge train and test across all datasets + merge = pd.concat([train_df, test_df], axis=0) + # rewrite data_clean.csv for each dataset + rewrite_data_with_splits( + paths_to_data, merge, override=override, check=check, repr_col=repr_col + ) + + +if __name__ == "__main__": + fire.Fire(cli) diff --git a/pyproject.toml b/pyproject.toml index 6dc7cd884..b107ccd5e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,7 +28,7 @@ dev = [ "flake8>=3.8.3", "isort>=5.0.0", "pre-commit", - "pydantic_yaml", + "pydantic_yaml<=0.11.2", "pytest", "pubchempy" ]