From 9e5ab3922f53ae41e5e923d2156f4faef69c7e31 Mon Sep 17 00:00:00 2001
From: Adrian Mirza <adrianmirza81@gmail.com>
Date: Mon, 16 Oct 2023 12:22:20 +0200
Subject: [PATCH] Add train-test split for tabular data (#418)

* feat: add scaffold split

* feat: add file rewriting and merge on common smiles

* feat: removed the logic for the other representations

* fix: reference

* fix: typo

* feat: fix minor details and add docstring

* fix: docstrings

* feat: add simple test script

* a bit of polish for the train/test split

* add docstring

* only work on files with SMILES col

* change rename behavior

* remove lint

* lint

* do not track dev notebook

* pin pydantic yaml

---------

Co-authored-by: Kevin Maik Jablonka <kevin.jablonka@epfl.ch>
---
 .pre-commit-config.yaml                       |   2 +-
 .../mattermodeling_stackexchange/transform.py |   4 +-
 .../develop_transform.ipynb                   |   0
 data/tabular/peptides_hemolytic/meta.yaml     | 285 +++++++++---------
 .../physics_stackexchange/transform.py        |   4 +-
 data/tabular/train_test_split.py              | 234 ++++++++++++++
 pyproject.toml                                |   2 +-
 7 files changed, 377 insertions(+), 154 deletions(-)
 delete mode 100644 data/tabular/orbnet_denali_training/develop_transform.ipynb
 create mode 100644 data/tabular/train_test_split.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f2a1169a9..63e6b64e0 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -6,7 +6,7 @@ ci:
 
 repos:
     - repo: https://github.com/pre-commit/pre-commit-hooks
-      rev: v4.4.0
+      rev: v4.5.0
       hooks:
           - id: check-json
           - id: check-yaml
diff --git a/data/tabular/mattermodeling_stackexchange/transform.py b/data/tabular/mattermodeling_stackexchange/transform.py
index 792da324b..1f665833b 100644
--- a/data/tabular/mattermodeling_stackexchange/transform.py
+++ b/data/tabular/mattermodeling_stackexchange/transform.py
@@ -1,5 +1,5 @@
-from datasets import load_dataset
 import pandas as pd
+from datasets import load_dataset
 
 
 def remove_repeated_almost_empty_lines(text):
@@ -31,7 +31,7 @@ def get_clean_df():
     # the answers are in an array of arrays, the first element is the answer, the second is the score
     # we then also only keep two columns, the question and the answer, both as string on which we also
     # call the strip function to remove leading and trailing whitespaces
-    for i, row in df.iterrows():
+    for _i, row in df.iterrows():
         # skip question with markdown image tag in it
         if "![" in row["question_text"]:
             continue
diff --git a/data/tabular/orbnet_denali_training/develop_transform.ipynb b/data/tabular/orbnet_denali_training/develop_transform.ipynb
deleted file mode 100644
index e69de29bb..000000000
diff --git a/data/tabular/peptides_hemolytic/meta.yaml b/data/tabular/peptides_hemolytic/meta.yaml
index 8df0e0099..1a882fdda 100644
--- a/data/tabular/peptides_hemolytic/meta.yaml
+++ b/data/tabular/peptides_hemolytic/meta.yaml
@@ -1,157 +1,146 @@
+---
 name: peptides_hemolytic
-description: "Hemolysis is referred to the disruption of erythrocyte\nmembranes that\
-  \ decrease the life span of red blood cells and causes\nthe release of Hemoglobin.\
-  \ It is critical to identify non-hemolytic\nantimicrobial peptides as a non-toxic\
-  \ and safe measure against bacterial\ninfections. However, distinguishing between\
-  \ hemolytic and non-hemolytic\npeptides is a challenge, since they primarily exert\
-  \ their activity at the\ncharged surface of the bacterial plasma membrane.\nThe\
-  \ data here comes from the Database of Antimicrobial Activity and Structure of\n\
-  Peptides (DBAASP v3). Hemolytic activity is defined by extrapolating a measurement\n\
-  assuming dose response curves to the point\nat which 50% of red blood cells are\
-  \ lysed. Activities below 100 mu g/ml, are\nconsidered hemolytic.\nThe data contains\
-  \ sequences of only L- and canonical amino acids. Each measurement\nis treated independently,\
-  \ so sequences can appear multiple times. This experimental\ndataset contains noise,\
-  \ and in some observations (40%), an identical sequence appears\nin both negative\
-  \ and positive class. As an example, sequence \"RVKRVWPLVIRTVIAGYNLYRAIKKK\"\nis\
-  \ found to be both hemolytic and\nnon-hemolytic in two different lab experiments\
-  \ (i.e. two different training examples). "
+description: "Hemolysis is referred to the disruption of erythrocyte\nmembranes that decrease the life span of red blood cells and causes\nthe release of\
+    \ Hemoglobin. It is critical to identify non-hemolytic\nantimicrobial peptides as a non-toxic and safe measure against bacterial\ninfections. However,\
+    \ distinguishing between hemolytic and non-hemolytic\npeptides is a challenge, since they primarily exert their activity at the\ncharged surface of\
+    \ the bacterial plasma membrane.\nThe data here comes from the Database of Antimicrobial Activity and Structure of\nPeptides (DBAASP v3). Hemolytic\
+    \ activity is defined by extrapolating a measurement\nassuming dose response curves to the point\nat which 50% of red blood cells are lysed. Activities\
+    \ below 100 mu g/ml, are\nconsidered hemolytic.\nThe data contains sequences of only L- and canonical amino acids. Each measurement\nis treated independently,\
+    \ so sequences can appear multiple times. This experimental\ndataset contains noise, and in some observations (40%), an identical sequence appears\n\
+    in both negative and positive class. As an example, sequence \"RVKRVWPLVIRTVIAGYNLYRAIKKK\"\nis found to be both hemolytic and\nnon-hemolytic in two\
+    \ different lab experiments (i.e. two different training examples). "
 targets:
-- id: hemolytic
-  description: The ability of a peptide sequence to lyse red blood cells (1) or not
-    (0).
-  units: null
-  type: boolean
-  names:
-  - noun: hemolytic activity
-  - noun: hemolysis
-  - verb: lyse red blood cells
-  - adjective: hemolytic
-  - gerund: lysing red blood cells
-  uris: null
+    - id: hemolytic
+      description: The ability of a peptide sequence to lyse red blood cells (1) or not (0).
+      units:
+      type: boolean
+      names:
+          - noun: hemolytic activity
+          - noun: hemolysis
+          - verb: lyse red blood cells
+          - adjective: hemolytic
+          - gerund: lysing red blood cells
+      uris:
 benchmarks: []
 identifiers:
-- id: sequence
-  type: Other
-  description: amino acid sequence
+    - id: sequence
+      type: Other
+      description: amino acid sequence
 license: CC BY 4.0
 links:
-- url: https://doi.org/10.1021/acs.jcim.2c01317
-  description: corresponding publication
-- url: https://doi.org/10.1093/nar/gkaa991
-  description: data source
+    - url: https://doi.org/10.1021/acs.jcim.2c01317
+      description: corresponding publication
+    - url: https://doi.org/10.1093/nar/gkaa991
+      description: data source
 num_points: 6541
 bibtex:
-- |-
-  @article{Martins2012,
-  doi = {10.1021/ci300124c},
-  url = {https://doi.org/10.1021/ci300124c},
-  year = {2012},
-  month = jun,
-  publisher = {American Chemical Society (ACS)},
-  volume = {52},
-  number = {6},
-  pages = {1686--1697},
-  author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro
-  and Andre O. Falcao},
-  title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling},
-  journal = {Journal of Chemical Information and Modeling}
-- |-
-  @article{Wu2018,
-  doi = {10.1039/c7sc02664a},
-  url = {https://doi.org/10.1039/c7sc02664a},
-  year = {2018},
-  publisher = {Royal Society of Chemistry (RSC)},
-  volume = {9},
-  number = {2},
-  pages = {513--530},
-  author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph
-  Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande},
-  title = {MoleculeNet: a benchmark for molecular machine learning},
-  journal = {Chemical Science}
+    - |-
+      @article{Martins2012,
+      doi = {10.1021/ci300124c},
+      url = {https://doi.org/10.1021/ci300124c},
+      year = {2012},
+      month = jun,
+      publisher = {American Chemical Society (ACS)},
+      volume = {52},
+      number = {6},
+      pages = {1686--1697},
+      author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro
+      and Andre O. Falcao},
+      title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling},
+      journal = {Journal of Chemical Information and Modeling}
+    - |-
+      @article{Wu2018,
+      doi = {10.1039/c7sc02664a},
+      url = {https://doi.org/10.1039/c7sc02664a},
+      year = {2018},
+      publisher = {Royal Society of Chemistry (RSC)},
+      volume = {9},
+      number = {2},
+      pages = {513--530},
+      author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph
+      Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande},
+      title = {MoleculeNet: a benchmark for molecular machine learning},
+      journal = {Chemical Science}
 templates:
-- The sequence of {#amino acids|AAs!} {sequence#} {#shows|exhibits|demonstrates!} {hemolytic#no
-  &NULL}{hemolytic__names__adjective} properties.
-- The amino acid sequence {sequence#} {#shows|exhibits|displays!} {hemolytic#no &NULL}{hemolytic__names__adjective}
-  properties.
-- Based on the {#amino acid sequence |sequence of amino acids !}{sequence#}, the peptide has {hemolytic#no &NULL}{hemolytic__names__adjective} {#properties|characteristics|features!}.
-- The {sequence__description} {sequence#} {#represents|is from!} a peptide
-  that is {hemolytic#not &NULL}identified as {hemolytic__names__adjective}.
-- The {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic#not
-  &NULL}{hemolytic__names__adjective}.
-- |-
-  Task: Please classify a peptide based on the description.
-  Description: A amino acid sequence that is {hemolytic__names__adjective}.
-  {#amino acid sequence |sequence of amino acids!}: {sequence#}
-  Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words.
-  Result: {hemolytic#False&True}
-- |-
-  Task: Please classify a amino acid sequence based on the description.
-  Description: A amino acid sequence that is {hemolytic__names__adjective}.
-  {#amino acid sequence |sequence of amino acids !}: {sequence#}
-  Constraint: Answer the question in a {#full|complete!} sentence.
-  Result: This amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}.
-- |-
-  Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids!} based on the {#text |!}description{# below|!}.
-  Description: A amino acid sequence that is {hemolytic__names__adjective}.
-  Result: {sequence#}
-- |-
-  User: Can you {#tell me|derive|estimate!} if the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic__names__adjective}?
-  Assistant: {hemolytic#No&Yes}, this amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}.
-- |-
-  User: Is the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} {hemolytic__names__adjective}?
-  Assistant: {hemolytic#No&Yes}, it is {hemolytic#not &NULL}{hemolytic__names__adjective}.
-- |-
-  User: Can you {#give me|create|generate!} the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}?
-  Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {sequence#}
-- |-
-  User: I'm {#searching|looking!} for the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}?
-  Assistant: This is a amino acid sequence that is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
-- |-
-  User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}.
-  Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}?
-  User: Yes, please. The amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}.
-  Assistant: {#Ok|Got it!},{# here you go,|!} this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
-- |-
-  User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}.
-  Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}?
-  User: Yes, the amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}.
-  Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
-- Is the {sequence__description} {sequence#} {hemolytic__names__adjective}:<EOI> {hemolytic#no&yes}
-- |-
-  Task: Please classify a {#amino acid sequence|sequence of amino acids|peptide!} based on the description.
-  Description: A amino acid sequence that is {hemolytic__names__adjective}.
-  {#amino acid sequence|sequence of amino acids!}: {sequence#}
-  Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words.
-  Result:<EOI> {hemolytic#False&True}
-- |-
-  Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!} based on the {#text |!}description{# below|!}.
-  Description: A {#amino acid sequence|sequence of amino acids|peptide!} that is {hemolytic__names__adjective}.
-  Result:<EOI> {sequence#}
-- |-
-  Task: Please answer the multiple choice question.
-  Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}?
-  Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words.
-  Options:
-  {hemolytic%}
-  Answer: {%multiple_choice_result}
-- |-
-  Task: Please answer the multiple choice question.
-  Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}?
-  Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words.
-  Options:
-  {hemolytic%}
-  Answer:<EOI> {%multiple_choice_result}
-- |-
-  Task: Please answer the multiple choice question.
-  Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}?
-  Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words.
-  Options:
-  {sequence%hemolytic%}
-  Answer: {%multiple_choice_result}
-- |-
-  Task: Please answer the multiple choice question.
-  Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}?
-  Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words.
-  Options:
-  {sequence%hemolytic%}
-  Answer:<EOI> {%multiple_choice_result}
+    - The sequence of {#amino acids|AAs!} {sequence#} {#shows|exhibits|demonstrates!} {hemolytic#no &NULL}{hemolytic__names__adjective} properties.
+    - The amino acid sequence {sequence#} {#shows|exhibits|displays!} {hemolytic#no &NULL}{hemolytic__names__adjective} properties.
+    - Based on the {#amino acid sequence |sequence of amino acids !}{sequence#}, the peptide has {hemolytic#no &NULL}{hemolytic__names__adjective} {#properties|characteristics|features!}.
+    - The {sequence__description} {sequence#} {#represents|is from!} a peptide that is {hemolytic#not &NULL}identified as {hemolytic__names__adjective}.
+    - The {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic#not &NULL}{hemolytic__names__adjective}.
+    - |-
+      Task: Please classify a peptide based on the description.
+      Description: A amino acid sequence that is {hemolytic__names__adjective}.
+      {#amino acid sequence |sequence of amino acids!}: {sequence#}
+      Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words.
+      Result: {hemolytic#False&True}
+    - |-
+      Task: Please classify a amino acid sequence based on the description.
+      Description: A amino acid sequence that is {hemolytic__names__adjective}.
+      {#amino acid sequence |sequence of amino acids !}: {sequence#}
+      Constraint: Answer the question in a {#full|complete!} sentence.
+      Result: This amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}.
+    - |-
+      Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids!} based on the {#text |!}description{# below|!}.
+      Description: A amino acid sequence that is {hemolytic__names__adjective}.
+      Result: {sequence#}
+    - |-
+      User: Can you {#tell me|derive|estimate!} if the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} is {hemolytic__names__adjective}?
+      Assistant: {hemolytic#No&Yes}, this amino acid sequence is {hemolytic#not &NULL}{hemolytic__names__adjective}.
+    - |-
+      User: Is the peptide with the {#amino acid sequence|sequence of amino acids!} {sequence#} {hemolytic__names__adjective}?
+      Assistant: {hemolytic#No&Yes}, it is {hemolytic#not &NULL}{hemolytic__names__adjective}.
+    - |-
+      User: Can you {#give me|create|generate!} the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}?
+      Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {sequence#}
+    - |-
+      User: I'm {#searching|looking!} for the {sequence__description} of a peptide that is {hemolytic#not &NULL}{hemolytic__names__adjective}?
+      Assistant: This is a amino acid sequence that is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
+    - |-
+      User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}.
+      Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}?
+      User: Yes, please. The amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}.
+      Assistant: {#Ok|Got it!},{# here you go,|!} this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
+    - |-
+      User: I want to {#come up with|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!}.
+      Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#amino acid sequence|one!}?
+      User: Yes, the amino acid sequence should {hemolytic#not &NULL}be {hemolytic__names__adjective}.
+      Assistant: {#Understood|Got it|Ok!}, this {sequence__description} is {hemolytic#not &NULL}{hemolytic__names__adjective}: {sequence#}
+    - Is the {sequence__description} {sequence#} {hemolytic__names__adjective}:<EOI> {hemolytic#no&yes}
+    - |-
+      Task: Please classify a {#amino acid sequence|sequence of amino acids|peptide!} based on the description.
+      Description: A amino acid sequence that is {hemolytic__names__adjective}.
+      {#amino acid sequence|sequence of amino acids!}: {sequence#}
+      Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words.
+      Result:<EOI> {hemolytic#False&True}
+    - |-
+      Task: Please {#give me|create|generate!} a {#amino acid sequence|sequence of amino acids|peptide!} based on the {#text |!}description{# below|!}.
+      Description: A {#amino acid sequence|sequence of amino acids|peptide!} that is {hemolytic__names__adjective}.
+      Result:<EOI> {sequence#}
+    - |-
+      Task: Please answer the multiple choice question.
+      Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}?
+      Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words.
+      Options:
+      {hemolytic%}
+      Answer: {%multiple_choice_result}
+    - |-
+      Task: Please answer the multiple choice question.
+      Question: Is the peptide with the {sequence__description} {#representation of |!}{sequence#} {hemolytic__names__adjective}?
+      Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words.
+      Options:
+      {hemolytic%}
+      Answer:<EOI> {%multiple_choice_result}
+    - |-
+      Task: Please answer the multiple choice question.
+      Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}?
+      Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words.
+      Options:
+      {sequence%hemolytic%}
+      Answer: {%multiple_choice_result}
+    - |-
+      Task: Please answer the multiple choice question.
+      Question: Which amino acid sequences are {hemolytic#not &NULL}{hemolytic__names__adjective}?
+      Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words.
+      Options:
+      {sequence%hemolytic%}
+      Answer:<EOI> {%multiple_choice_result}
diff --git a/data/tabular/physics_stackexchange/transform.py b/data/tabular/physics_stackexchange/transform.py
index e5a95243e..a392709a4 100644
--- a/data/tabular/physics_stackexchange/transform.py
+++ b/data/tabular/physics_stackexchange/transform.py
@@ -1,5 +1,5 @@
-from datasets import load_dataset
 import pandas as pd
+from datasets import load_dataset
 
 
 def remove_repeated_almost_empty_lines(text):
@@ -31,7 +31,7 @@ def get_clean_df():
     # the answers are in an array of arrays, the first element is the answer, the second is the score
     # we then also only keep two columns, the question and the answer, both as string on which we also
     # call the strip function to remove leading and trailing whitespaces
-    for i, row in df.iterrows():
+    for _i, row in df.iterrows():
         # skip question with markdown image tag in it
         if "![" in row["question_text"]:
             continue
diff --git a/data/tabular/train_test_split.py b/data/tabular/train_test_split.py
new file mode 100644
index 000000000..ad19faba6
--- /dev/null
+++ b/data/tabular/train_test_split.py
@@ -0,0 +1,234 @@
+"""Perform scaffold split on all datasets and rewrite data_clean.csv files.
+
+Scaffold split is a method of splitting data that ensures that the same scaffold
+is not present in both the train and test sets. This is important for evaluating
+the generalizability of a model.
+
+For more information, see:
+    - Wu, Z.; Ramsundar, B.; Feinberg, E. N.; Gomes, J.; Geniesse, C.; Pappu, A. S.;
+        Leswing, K.; Pande, V. MoleculeNet: A Benchmark for Molecular Machine Learning.
+        Chemical Science 2018, 9 (2), 513–530. https://doi.org/10.1039/c7sc02664a.
+    - Jablonka, K. M.; Rosen, A. S.; Krishnapriyan, A. S.; Smit, B.
+        An Ecosystem for Digital Reticular Chemistry. ACS Central Science 2023, 9 (4), 563–581.
+        https://doi.org/10.1021/acscentsci.2c01177.
+
+"""
+import os
+import sys
+from collections import defaultdict
+from glob import glob
+from random import Random
+from typing import Dict, List
+
+import fire
+import pandas as pd
+from rdkit import Chem, RDLogger
+from rdkit.Chem.Scaffolds import MurckoScaffold
+from tqdm import tqdm
+
+RDLogger.DisableLog("rdApp.*")
+
+
+def print_sys(s):
+    """Write a message to standard error and flush it right away.
+
+    Args:
+        s (str): the message to write
+    """
+    print(s, file=sys.stderr, flush=True)
+
+
+def create_scaffold_split(
+    df: pd.DataFrame, seed: int, frac: List[float], entity: str = "SMILES"
+) -> Dict[str, pd.DataFrame]:
+    """Split a dataframe so that no Murcko scaffold is shared between the parts.
+
+    A scaffold is generated for every molecule, the rows are grouped by scaffold
+    and each group is assigned as a whole to train, valid or test.
+    adapted from: https://github.com/mims-harvard/TDC/tdc/utils/split.py
+
+    Args:
+        df (pd.DataFrame): dataset dataframe
+        seed (int): the random seed used to shuffle the scaffold groups
+        frac (list): a list of train/valid/test fractions. If the test
+            fraction is 0, all rows that do not fit into train go to valid
+        entity (str): the name of the column that stores the molecules
+
+    Returns:
+        dict: a dictionary of split dataframes, where keys are train/valid/test
+        and values correspond to each dataframe. Rows for which no scaffold
+        could be generated are reported on stderr and left out of all parts.
+    """
+    # own generator instance, the global random state is not touched
+    random = Random(seed)
+
+    s = df[entity].values
+    # scaffold SMILES -> positional indices of the rows with that scaffold
+    scaffolds = defaultdict(set)
+
+    error_smiles = 0
+    for i, smiles in tqdm(enumerate(s), total=len(s)):
+        try:
+            scaffold = MurckoScaffold.MurckoScaffoldSmiles(
+                mol=Chem.MolFromSmiles(smiles), includeChirality=False
+            )
+            scaffolds[scaffold].add(i)
+        except Exception:
+            # f-string instead of `+`: a non-string entry (e.g. NaN) has to be
+            # reported and skipped, not raise a TypeError inside the handler
+            print_sys(f"{smiles} returns RDKit error and is thus omitted...")
+            error_smiles += 1
+
+    train, val, test = [], [], []
+    # the target sizes refer to the rows that could be processed; int()
+    # truncates, the remainder is attributed to the test set
+    n_processed = len(df) - error_smiles
+    train_size = int(n_processed * frac[0])
+    val_size = int(n_processed * frac[1])
+    test_size = n_processed - train_size - val_size
+
+    # groups larger than half of the valid or test target are moved to the
+    # front of the assignment order; both lists are shuffled independently
+    big_index_sets = []
+    small_index_sets = []
+    for index_set in scaffolds.values():
+        if len(index_set) > val_size / 2 or len(index_set) > test_size / 2:
+            big_index_sets.append(index_set)
+        else:
+            small_index_sets.append(index_set)
+    random.shuffle(big_index_sets)
+    random.shuffle(small_index_sets)
+    index_sets = big_index_sets + small_index_sets
+
+    # without a test fraction, whatever does not fit into train goes to valid;
+    # groups are never divided, so the parts only approximately match frac
+    no_test = frac[2] == 0
+    for index_set in index_sets:
+        if len(train) + len(index_set) <= train_size:
+            train.extend(index_set)
+        elif no_test or len(val) + len(index_set) <= val_size:
+            val.extend(index_set)
+        else:
+            test.extend(index_set)
+
+    return {
+        "train": df.iloc[train].reset_index(drop=True),
+        "valid": df.iloc[val].reset_index(drop=True),
+        "test": df.iloc[test].reset_index(drop=True),
+    }
+
+
+def rewrite_data_with_splits(
+    csv_paths: List[str],
+    train_test_df: pd.DataFrame,
+    override: bool = False,
+    check: bool = True,
+    repr_col: str = "SMILES",
+) -> None:
+    """Rewrite dataframes with the correct split column
+
+    Args:
+        csv_paths (List[str]): list of files to merge (data_clean.csv)
+        train_test_df (pd.DataFrame): dataframe with the columns "SMILES" and
+            "split" that assigns the molecules of all datasets to a split
+        override (bool): whether to override the existing data_clean.csv files
+            defaults to False, the old file is then kept as data_clean_old.csv
+        check (bool): whether to check if the split was successful
+            defaults to True. Can be turned off to save memory
+        repr_col (str): the column name for where SMILES representation is stored
+            in the csv files, defaults to "SMILES"
+
+    Raises:
+        ValueError: if train or test is empty or they overlap (file is not written)
+    """
+    # the csv files may name the molecule column differently than train_test_df
+    split_df = train_test_df.rename(columns={"SMILES": repr_col})
+    if check:
+        train_smiles = set(split_df.query("split == 'train'")[repr_col])
+
+    for path in csv_paths:
+        read_dataset = pd.read_csv(path)
+        if repr_col not in read_dataset.columns:
+            print(f"Skipping {path} as it does not contain {repr_col} column")
+            continue
+        if "split" in read_dataset.columns:
+            read_dataset = read_dataset.drop("split", axis=1)
+            message = f"Split column found in {path}."
+            if override:
+                message += " Overriding..."
+            print(message)
+        else:
+            print(f"No split column in {path}")
+
+        merged_data = pd.merge(read_dataset, split_df, on=repr_col, how="left")
+        # drop only rows without a split; dropna() would drop any row with a NaN
+        merged_data = merged_data.dropna(subset=["split"])
+
+        # validate before touching the disk, a failed split must not change files
+        if len(merged_data.query("split == 'train'")) == 0:
+            raise ValueError("Split failed, no train data")
+        if len(merged_data.query("split == 'test'")) == 0:
+            raise ValueError("Split failed, no test data")
+        if check:
+            test_split_smiles = set(merged_data.query("split == 'test'")[repr_col])
+            if train_smiles & test_split_smiles:
+                raise ValueError("Split failed, train and test overlap")
+        if not override:
+            # keep the old data_clean.csv file as data_clean_old.csv
+            root, ext = os.path.splitext(path)
+            os.rename(path, f"{root}_old{ext}")
+        merged_data.to_csv(path, index=False)
+
+
+def cli(
+    seed: int = 42,
+    train_size: float = 0.8,
+    val_size: float = 0.0,
+    test_size: float = 0.2,
+    path: str = "*/data_clean.csv",
+    override: bool = False,
+    check: bool = True,
+    repr_col: str = "SMILES",
+):
+    """Scaffold-split the pooled molecules and add a `split` column to every file.
+
+    Args:
+        seed (int): seed of the shuffle inside the scaffold split
+        train_size (float): fraction of the unique molecules assigned to train
+        val_size (float): fraction of the unique molecules assigned to valid
+        test_size (float): fraction of the unique molecules assigned to test
+        path (str): glob pattern selecting the data_clean.csv files
+        override (bool): overwrite the files without keeping a backup copy
+        check (bool): verify that train and test molecules do not overlap
+        repr_col (str): column of the csv files that holds the SMILES strings
+    """
+    # NOTE(review): only these datasets are processed; the filter looks like a
+    # development leftover -- confirm before running on the whole collection
+    keywords = ("flashpoint", "freesolv", "peptide")
+    # glob order depends on the file system, so sort for reproducibility
+    paths_to_data = [p for p in sorted(glob(path)) if any(k in p for k in keywords)]
+
+    # pool the unique molecules of all files so that each one gets a single label
+    representations = set()
+    for csv_path in tqdm(paths_to_data):
+        df = pd.read_csv(csv_path)
+        if repr_col in df.columns:
+            # missing values cannot be parsed into a molecule, so skip them
+            representations.update(df[repr_col].dropna().to_list())
+
+    # sort: the iteration order of a set of str changes between interpreter runs,
+    # which would make the split irreproducible even with a fixed seed
+    repr_df = pd.DataFrame({"SMILES": sorted(representations)})
+    scaffold_split = create_scaffold_split(
+        repr_df, seed=seed, frac=[train_size, val_size, test_size]
+    )
+
+    # label every part with its name, including "valid" so its rows are kept
+    merge = pd.concat([p.assign(split=n) for n, p in scaffold_split.items()])
+    rewrite_data_with_splits(
+        paths_to_data, merge, override=override, check=check, repr_col=repr_col
+    )
+
+
+if __name__ == "__main__":
+    fire.Fire(cli)
diff --git a/pyproject.toml b/pyproject.toml
index 6dc7cd884..b107ccd5e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ dev = [
         "flake8>=3.8.3",
         "isort>=5.0.0",
         "pre-commit",
-        "pydantic_yaml",
+        "pydantic_yaml<=0.11.2",
         "pytest",
         "pubchempy"
 ]