From 84ccd8471dcd560a0b2a7e3e9ec381b44dfaebaa Mon Sep 17 00:00:00 2001 From: Kevin Maik Jablonka Date: Tue, 13 Aug 2024 09:28:19 -0700 Subject: [PATCH] update docs and add more tests --- data/tabular/bicerano_dataset/meta.yaml | 14 +- docs/CONTRIBUTING.md | 353 +++++------------------- docs/EXPERIMENT.md | 77 ------ docs/SUBMODULES.md | 73 ----- docs/api/meta_yaml_augmentor.md | 64 +++++ docs/index.md | 4 + mkdocs.yml | 7 +- src/chemnlp/data/meta_yaml_augmentor.py | 2 +- src/chemnlp/data/sampler.py | 2 +- tests/data/test_sampler.py | 182 ++++++++++++ 10 files changed, 333 insertions(+), 445 deletions(-) delete mode 100644 docs/EXPERIMENT.md delete mode 100644 docs/SUBMODULES.md create mode 100644 docs/api/meta_yaml_augmentor.md diff --git a/data/tabular/bicerano_dataset/meta.yaml b/data/tabular/bicerano_dataset/meta.yaml index 305bb1b93..907e4e8b9 100644 --- a/data/tabular/bicerano_dataset/meta.yaml +++ b/data/tabular/bicerano_dataset/meta.yaml @@ -54,12 +54,12 @@ bibtex: year = {2021}, doi = {10.1021/acsapm.0c00524}} templates: - - The polymer with the {PSMILES__description} of {PSMILES#} has an experimental glass transition temperature of {Tg_exp#} K. - - The polymer with the {PSMILES__description} of {PSMILES#} has a computed glass transition temperature of {Tg_calc#} K. - - The polymer with the {PSMILES__description} of {PSMILES#} has a computed density at 300 K of {rho_300K_calc#} g/cc. - - The polymer with the {compound_name__names__noun} of {compound_name#} has an experimental glass transition temperature of {Tg_exp#} K. - - The polymer with the {compound_name__names__noun} of {compound_name#} has a computed glass transition temperature of {Tg_calc#} K. - - The polymer with the {compound_name__names__noun} of {compound_name#} has a computed density at 300 K of {rho_300K_calc#} g/cc. + - The polymer with the {PSMILES__description} of {PSMILES#} has an experimental glass transition temperature of {Tg_exp#} {Tg_exp__units}. + - The polymer with the {PSMILES__description} of {PSMILES#} has a computed glass transition temperature of {Tg_calc#} {Tg_exp__units}. + - The polymer with the {PSMILES__description} of {PSMILES#} has a computed density at 300 K of {rho_300K_calc#} {rho_300K_calc__units}. + - The polymer with the {compound_name__names__noun} of {compound_name#} has an experimental glass transition temperature of {Tg_exp#} {Tg_exp__units}. + - The polymer with the {compound_name__names__noun} of {compound_name#} has a computed glass transition temperature of {Tg_calc#} {Tg_calc__units}. + - The polymer with the {compound_name__names__noun} of {compound_name#} has a computed density at 300 K of {rho_300K_calc#} {rho_300K_calc__units}. - |- - Question: What is a polymer with a computed glass transition temperature of {Tg_calc#} K and a computed density at 300 K of {rho_300K_calc#} g/cc. + Question: What is a polymer with a computed glass transition temperature of {Tg_calc#} {Tg_calc__units} and a computed density at 300 K of {rho_300K_calc#} {rho_300K_calc__units}. Answer: A polymer with {PSMILES__description} {PSMILES#} diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 1d4773e96..3d4fec76e 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -1,312 +1,99 @@ -There are many different ways to contribute to ChemNLP! 
-You can get in touch via the GitHub [task board](https://github.com/orgs/OpenBioML/projects/5?query=is:open+sort:updated-desc) and [issues](https://github.com/OpenBioML/chemnlp/issues?q=is:issue+is:open+sort:updated-desc&query=is:open+sort:updated-desc) and our [Discord](https://t.co/YMzpevmkiN). +# Contributing to ChemNLP -## Prerequisites +Thank you for your interest in contributing to ChemNLP! There are many ways to contribute, including implementing datasets, improving code, and enhancing documentation. -Please make a [GitHub account](https://github.com/) prior to implementing a dataset; you can follow instructions to install git [here](https://git-scm.com/book/en/v2/Getting-Started-Installing-Git). +## Getting Started -1. [Fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the [ChemNLP repository](https://github.com/OpenBioML/chemnlp) -2. [Clone your fork](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) -3. [Make a new branch](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) -4. Please try using [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/) for formatting your commit messages +1. Create a [GitHub account](https://github.com/) if you don't have one. +2. [Fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the [ChemNLP repository](https://github.com/OpenBioML/chemnlp). +3. [Clone your fork](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository). +4. [Create a new branch](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) for your contribution. +5. Set up your development environment as described in the `Installation and set-up` section of [README.md](README.md). -If you wish to work on one of the submodules for the project, please see the [git workflow](SUBMODULES.md) docs. +## Implementing a Dataset -## Create a development environment (For code/dataset contributions) +One of the most valuable contributions is implementing a dataset. Here's how to do it: -For code and data contributions, we recommend you creata a [conda environment](https://conda.io/projects/conda/en/latest/user-guide/tasks/manage-environments.html). If you do not have conda already installed on your system, we recommend installing [miniconda](https://docs.conda.io/en/latest/miniconda.html): +1. Choose a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) or add a new one there. +2. Create an issue in this repository stating your intention to add the dataset. +3. Make a Pull Request (PR) that adds a new folder in `data` with the following files: -To create your developer environment please follow the guidelines in the `Installation and set-up` of [README.md](README.md) + - `meta.yaml`: Describes the dataset (see structure below). + - `transform.py`: Python code to transform the original dataset into a usable form. -## Work package leads - -If you are contributing to an existing task which contains a `work package: ` label, please refer to the list below to find a main point of contact for that piece of work. If you've any questions or wish to contribute additional issues feel free to reach out to these work package leads from the core team on the [OpenBioML Discord](https://discord.gg/GgDBFP8ZEt) or message directly on GitHub issues. 
- -| Name (discord & github) | Main Work Packages | -| ------------------------------------------------------ | -------------------------------------------------------------- | -| Michael Pieler (MicPie#9427 & MicPie) | 💾 Structured Data, Knowledge Graph, Tokenisers, Data Sampling | -| Kevin Jablonka (Kevin Jablonka#1694 & kjappelbaum) | 💾 Structured Data, Knowledge Graph, Tokenisers, Data Sampling | -| Bethany Connolly (bethconnolly#3951 & bethanyconnolly) | 📊 Model Evaluation | -| Jack Butler (Jack Butler#8114 & jackapbutler) | ⚙️ Model Training | -| Mark Worrall (Mark Worrall#3307 & maw501) | 🦑 Model Adaptations | - -# Implementing a dataset - -## Contributing a dataset - -One of the most important ways to contribute to the ChemNLP efforts is to implement a dataset. -With "implementing" we mean the following: - -- Take a dataset from our [awesome list](https://github.com/kjappelbaum/awesome-chemistry-datasets) (if it is not there, please add it there first, so we keep track) -- Make an issue in this repository that you want to add this dataset (we will label this issue and assign it to you) -- Make a PR that adds in a new folder in `data` - - - `meta.yaml` describing the dataset in the form that `transform.py` produces. We will use this later to construct the prompts. - > If your dataset has multiple natural splits (i.e. train, test, validation) you can create a \_meta.yaml for each. - - `transform.py` Python code that transforms the original dataset (linked in `meta.yaml`) into a form that can be consumed by the loader. - For tabular datasets that will mostly involve: Removing/merging duplicated entries, renaming columns, dropping unused columns. - Try to keep the output your `transform.py` uses as lean as possible (i.e. no columns that will not be used). - In some cases, you might envision that extra columns might be useful. If this is the case, please add them (e.g., indicating some grouping, etc.) - Even though some examples create the `meta.yaml` in `transform.py` there is no need to do so. You can also do it by hand. - In most cases the data will be stored in a tabular format and should be named `data_clean.csv`. - - In the `transform.py` please try to download the data from an official resource. - We encourage you to upload the raw data to HuggingFace Hub, Foundry or some other repository and then retrieve the data from there with your script, if the raw data license permits it. - - - If you need additional dependencies, add them to `dev-requirements.txt` (those are needed for linting/testing/validation) or `requirements.txt` (those are the ones for running `transform.py`) - -The `meta.yaml` has the following structure: +### meta.yaml Structure ```yaml -name: aquasoldb # unique identifier, we will also use this for directory names -description: | # short description what this dataset is about - Curation of nine open source datasets on aqueous solubility. - The authors also assigned reliability groups. 
+name: dataset_name +description: Short description of the dataset targets: - - id: Solubility # name of the column in a tabular dataset - description: Experimental aqueous solubility value (LogS) # description of what this column means - units: log(mol/L) # units of the values in this column (leave empty if unitless) - type: continuous , "boolean" - names: # names for the property (to sample from for building the prompts) - - noun: aqueous solubility - - noun: solubility in water - - id: SD - description: Standard deviation of the experimental aqueous solubility value for multiple occurences - units: log(mol/L) - type: continuous + - id: target_name + description: Description of the target + units: Units of the target (if applicable) + type: continuous or boolean names: - - noun: standard deviation of the aqueous solubility - - noun: tandard deviation of the solubility in water -benchmarks: # lists all benchmarks this dataset has been part of. split_column is a column in this dataframe with the value "train", "valid", "test" - indicating to which fold a specific entry belongs to - - name: TDC - link: https://tdcommons.ai/ + - noun: target noun + - adjective: target adjective +benchmarks: + - name: benchmark_name + link: benchmark_link split_column: split identifiers: - - id: InChI # column name - type: InChI # can be "SMILES", "SELFIES", "IUPAC", "Other", "InChI", "InChiKey", "RXNSMILES", "RXNSMILESWAdd" see IdentifierEnum - description: International Chemical Identifier # description (optional, except for "OTHER") -license: CC0 1.0 # license under which the original dataset was published -num_points: 10000 # number of datapoints in this dataset -links: # list of relevant links (original dataset, other uses, etc.) - - name: dataset - url: https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/OVHAW8 - description: Original dataset -bibtex: # citation(s) for this dataset in BibTeX format - - | - "@article{Sorkun_2019, - doi = {10.1038/s41597-019-0151-1}, - url = {https://doi.org/10.1038%2Fs41597-019-0151-1}, - year = 2019, - month = {aug}, - publisher = {Springer Science and Business Media {LLC}}, - volume = {6}, - number = {1}, - author = {Murat Cihan Sorkun and Abhishek Khetan and Süleyman Er}, - title = {{AqSolDB}, a curated reference set of aqueous solubility and 2D descriptors for a diverse set of compounds}, - journal = {Sci Data} - }" -``` - -Please do not simply copy/paste generic descriptions but try to give a concise and specific description for the dataset you are adding. - -For the typical material-property datasets, we will later use the `identifier` and `property` columns to create and fill prompt templates. - -### Text templates - -With our text template setup for the sampling you can: - -- use all the data from the `meta.yaml` file, -- recode categorical data, and -- chain together multiple data fields from the tabular and meta data. - -#### Example text template 1 (mainly used for tabular data) - -``` -The molecule with the {SMILES__description} representation of {SMILES#} exhibits {mutagenic#no &NULL}{mutagenic__names__adjective} properties. + - id: identifier_name + type: SMILES, InChI, etc. + description: Description of the identifier +license: Dataset license +num_points: Number of datapoints +links: + - name: link_name + url: link_url + description: Link description +bibtex: Citation in BibTeX format ``` -- `SMILES__description` gets you the text from the description field of the SMILES identifier. 
The `__` dunder (double underscore) is used to indicate the levels in the `meta.yaml` file. -- `SMILES#` gets you the data of the sampled SMILES entry (= row from the tabular data). The `#` is used to get the corresponding data. -- `mutagenic#no &NULL` gets you the data with `#` and recodes it. The recoding options are separated with a `&`. In this example the binary variable `mutagenic` that can be `0` or `1` gets recoded to `no ` and `NULL`. `NULL` is a "reserved word" an indicates [no value](). Thus, the `no ` gets added in front of the `mutagenic__names__adjective` if `mutagenic# == 0`. -- `mutagenic__names__adjective` gets from the `mutagenic` target the adjective names. - -#### Example text template 2 (mainly used for KG data)` - -``` -The {node1_type#} {node1_name#|node1_smiles#} {rel1_type#} the {node2_type#} {node2_protein_names#} which {rel2_type#} the {node3_type#} {node3_name#}. -``` - -- `node1_name#|node1_smiles#` chains together two data fields from the tabular data with `|` so they are jointly sampled for this position. This means that we sample in this case from the name and the SMILES representation. -- A similar setup can be used in a single data entry (= row from the tabular data) of the tabular data: For `node2_protein_names` the field can include several protein names separated by a `|`, e.g., `Pyruvate dehydrogenase E1 component subunit beta, mitochondrial|PDHE1-B` which then samples from `Pyruvate dehydrogenase E1 component subunit beta, mitochondrial` or `PDHE1-B`. - -#### Example text templates 3 for multiple choice setups - -Multiple choice setups are also supported. For this we need three components: - -- `%multiple_choice_enum%2%aA1` can be used to list the multiple choice enumerations, i.e., `1, 2, or 3`, `A or B`, etc., The second `%` starts the multiple choice number sequence. Single integers and a range consisting of two integers separated by a `-` are supported to set the lower and higher number, e.g., `2-5` will sample a value between 2 and 5, including the boundaries, for the answer options. The third `%` is used to subselect multiple choice enumerations, i.e., `a` for lower case alphabetical enumerations, `A` for upper case alphabetical, and `1` for numerical enumerations. -- `mutagenic%` is used to list the multiple choice enumerations with the corresponding possible answer options after the multiple choice enumerations, and -- `%multiple_choice_result` is used to get the multiple choice enumeration of the answer, i.e., `1`, `c`. - Please pay attention to the `%` symbol and its position as this is used to parse the different control elements from the text template. - The sampling procedure incorporates a range of different multiple choice enumerations that are sampled randomly: -- numerical (`1, 2, 3, ...`) and alphabetical (`a, b, c, ...` or `A, B, C, ...`) enumerations combined with -- different suffixes, i.e., ` ` (no suffix), `.`, `.)`, `)`, and `:`, to create a range of different multiple choice enumerations. - If only the choices `0` or `1` are available they will be recoded with `False` and `True`. - -##### Standard template - -``` -Task: Please answer the multiple choice question below with {%multiple_choice_enum%2%aA1}. -Question: Is the molecule with the {SMILES__description} representation of {SMILES#} {mutagenic__names__adjective}? -Options: -{mutagenic%} -Answer: {%multiple_choice_result} -``` - -Example output: - -``` -Task: Please answer the multiple choice question below with A or B. 
-Question: Is the molecule with the SMILES representation of CC(C)NCC(O)c1ccc2ccccc2c1 Ames mutagenic? -Options: -A) False -B) True -Answer: A" -``` - -##### Template for benchmarking - -``` -Task: Please answer the multiple choice question below with {%multiple_choice_enum%2%aA1}. -Question: Is the molecule with the {SMILES__description} representation of {SMILES#} {mutagenic__names__adjective}? -Options: -{mutagenic%} -Answer:{%multiple_choice_result} -``` - -The benchmarking setup exports additional fields for the benchmarking setup, see the example below: -`{"input":"Task: Please answer the multiple choice question below with 1 or 2.\nQuestion: Is the molecule with the SMILES representation of BrCBr Ames mutagenic?\nOptions:\n1.) False\n2.) True\nAnswer:","output":" 2","output_choices":["1","2"],"correct_output_index":"1"}` -Please have a look at the following section below about the general benchmarking template setup. - -#### Example text templates 4 for flexible multiple choice setups - -More flexible multiple choice setups are also supported. The standard multiple choice setup from "Example text templates 3 for multiple choice setups" is intended for features of molecules as those are deduplicated during the sampling process. In contrast, this flexible multiple choice setup also lets you use the molecule identifiers, e.g., SMILES, in the multiple choice options. - -For this we only need to add one component to the previously outlined multiple choice format: - -- In order to let the model predict which `SMILES` has or has not the boolean variable `penetrate_BBB` we simply add `SMILES%penetrate_BBB%` as an enumeration placeholder for the possible options. With that the list of the multiple choice enumerations shows the SMILES data. Note that the `penetrate_BBB#not &NULL` is needed because the sampling is based on the individual sample (= row from the tabular data) and depending on if `penetrate_BBB` is `True` or `False` we look for a different result label because in the code we compare the sampled options to the `penetrate_BBB` value of the specific sample (= entry from the specific row from the tabular data). - -``` -Task: Please answer the multiple choice question. -Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? -Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any other words. -Options: -{SMILES%penetrate_BBB%} -Answer: {%multiple_choice_result} -``` - -``` -Task: Please answer the multiple choice question. -Question: Which molecules are not penetrating the blood brain barrier? -Constraint: You must select none, one or more options from A, B, or C without using any other words. -Options: -A. Cc1ccsc1C(=CCCN1CCC[C@@H](C(=O)O)C1)c1sccc1C -B. CC(=O)N1CCN(c2ccc(OC[C@H]3CO[C@](Cn4ccnc4)(c4ccc(Cl)cc4Cl)O3)cc2)CC1 -C. CCCC(C)C1(CC)C(=O)NC(=S)NC1=O -Answer: B, C -``` - -#### Benchmarking text templates - -There are two versions of text templates, i.e., text templates with and without the end-of-input token ``: - -``` -The {SMILES__description} {SMILES#} is {mutagenic#no &NULL}{mutagenic__names__adjective}. -Is the {SMILES__description} {SMILES#} {mutagenic__names__adjective}:{mutagenic# yes& no} -``` - -The `` token indicates the splitting position for the benchmarking export, i.e., everything before it will be written to the `input` field and everything afterwards to the `output` field. Without `` everything will be in the `text` field. 
-In the current setup, you can switch with the `benchmarking_templates` flag of the [`TemplateSampler` class](https://github.com/OpenBioML/chemnlp/blob/text_sampling/text_sampling/text_sampling.py#L104) between text templates with and without ``. - -The filename scheme uses the split information for the export, i.e., `train.jsonl`, `test.jsonl`, etc., and if no split information is available this will be set to `full` and exported to `full.jsonl`. With `` the filename ends with `_benchmark.jsonl` instead of `.jsonl`. - -Have a look at the [`meta.yaml` file](https://github.com/OpenBioML/chemnlp/blob/text_sampling/data/ames_mutagenicity/meta.yaml) to see the corresponding structure there. - -In case you run into issues (or think you don't have enough compute or storage), please let us know. Also, in some cases `csv` might not be the best format. If you think that `csv` is not suitable for your dataset, let us know. - -For now, you do not need to upload the transformed datasets anywhere. -We will collect the URLs of the raw data in the `meta.yaml` files and the code to produce the curated data in `transform.py` and then run in this on dedicated infrastructure. - -### How will the datasets be used? - -If your dataset is in tabular form, we will construct prompts using, for example, the LIFT framework. -In this case, we will sample from the identifier and targets columns. If you specify prompt templates, we will also sample from those. -Therefore, it is very important that the column names in the `meta.yaml` match the ones in the file that `transform.py` produces. -One example of a prompt we might construct is `"What is the of "`, where we sample `target_name` from the names of the targets listed in `meta.yaml` and `identifier` from the identifiers provided in `meta.yaml`. - -#### Splits - -If your dataset is part of a benchmark, please indicate what fold your data is part of using an additional `split_col` in which you use `train`, `valid`, `test` to indicate the split type. -Please indicate this in the `meta.yaml` under the field `split_col`. - -#### Identifiers - -We ask you to add `uris` and `pubchem_aids` in case you find suitable references. We distinguish certain types of identifiers, for which you have to specify the correct strings. The currently allowed types are in the `IdentifierEnum` in `src/chemnlp/data_val/model.py`: - -- `SMILES`: Use the canonical form ([RdKit](https://www.rdkit.org/docs/GettingStartedInPython.html)) -- `SELFIES`: [Self-referencing embedded strings](https://github.com/aspuru-guzik-group/selfies) -- `IUPAC`: IUPAC-Name, not use it for non-standard, common names -- `InChI` -- `InChIKey`: The key derived from the `InChI` -- `RXNSMILES`: The [reaction SMILES](https://www.daylight.com/meetings/summerschool98/course/dave/smiles-react.html) containing only educt and product -- `RXNSMILESWAdd`: The reaction SMILES also containing solvent and additives -- `Other`: For all other identifiers - -##### Uniform Resource Identifiers (URIs) - -If you have a uniform resource identifier (URI) that links to a suitable name of a property, please list it in the `uris` list for a given `target`. -Please ensure that the link is specific. If you have a boolean target that measures inhibition of a protein, link to `inhbitor of XY` and _not_ to the protein. -If such a link does not exist, leave the field empty. 
- -You might find suitable links using the following resources: - -- https://bioportal.bioontology.org/search -- https://goldbook.iupac.org/ - -#### PubChem Assay IDs - -For some targets, the activity was measured using assays. In this case, please list the assays using with their _numeric_ PubChem assay id in the field `pubchem_aids`. -Please ensure that the _first_ entry in this list is a primary scan which corresponds to the target property (and not to its inverse or a control). -Keep in mind that we plan to look up the name and the description of the assay to build prompt. That is, the name of the assay of the _first entry_ in this list should also work in a prompt such as `Is active in `?` - -#### Prompt examples - -For datasets that are not in tabular form, we are still discussing the best process, but we also envision that we might perform some named-entity-recognition to also use some of the text datasets in a framework such as LIFT. Otherwise, we will simple use them in the typical GPT pretraining task. +### transform.py Guidelines -## Implementing structured data sampler +- Download data from an official source or upload it to a repository and retrieve it from there. +- For tabular datasets: remove/merge duplicates, rename columns, and drop unused columns. +- Output should be as lean as possible, typically in a `data_clean.csv` file. +- Add any necessary dependencies to `dev-requirements.txt` or `requirements.txt`. -TBD. +## Text Templates -## Implementing tokenizers +Text templates are used for sampling and can utilize data from `meta.yaml`, recode categorical data, and chain multiple data fields. Examples include: -TBD. +1. Basic template: -## Implementing model adaptations + ``` + The molecule with {SMILES__description} {SMILES#} has {property#} {property__units}. + ``` -Our first experiments will be based on [Pythia model](https://github.com/EleutherAI/pythia) suite from [EleuetherAI](https://www.eleuther.ai) that is based on [GPT-NeoX](https://github.com/EleutherAI/gpt-neox). +2. Multiple choice template: -If you are not familiar LLM training have a look at this very good guide: [Large-scale language modeling tutorials with PyTorch from TUNiB](https://nbviewer-org.translate.goog/github/tunib-ai/large-scale-lm-tutorials/blob/main/notebooks/01_introduction.ipynb?_x_tr_sl=auto&_x_tr_tl=en&_x_tr_hl=de&_x_tr_pto=wapp) + ``` + Task: Answer the multiple choice question. + Question: Is the molecule with {SMILES__description} {SMILES#} {property__names__adjective}? + Options: {%multiple_choice_enum%2%aA1} + {property%} + Answer: {%multiple_choice_result} + ``` -Please have a look for the details in the [corresponding section in our proposal](https://docs.google.com/document/d/1C44EKSJRojm39P2CaxnEq-0FGwDRaknKxJ8lZI6xr5M/edit#heading=h.aww08l8o9tti). +3. Benchmarking template: + ``` + Is the molecule with {SMILES__description} {SMILES#} {property__names__adjective}?{property#yes&no} + ``` -## Hugging Face Hub +## Testing Your Contribution -We have a preference for using the Hugging Face Hub and processing datasets through the [`datasets`](https://github.com/huggingface/datasets) package when storing larger datasets on the [OpenBioML](https://huggingface.co/OpenBioML) hub as it can offer us a lot of nice features such as +- Ensure your code passes all existing tests. +- Add new tests for any new functionality you introduce. +- Run `pytest` to check all tests pass. 
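+
+For example, to run the suite locally (a minimal sketch assuming you work from the repository root with the dev requirements installed):
+
+```bash
+# run the full test suite
+python -m pytest
+
+# or only the sampler tests while iterating on a dataset contribution
+python -m pytest tests/data/test_sampler.py -v
+```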
-- Easy multiprocessing parallelism for data cleaning -- Version controlling of the datasets as well as our code -- Easy interface into tokenisation & other aspects for model training -- Reuse of utility functions once we have a consistent data structure. +## Submitting Your Contribution -However, don't feel pressured to use this if you're more comfortable contributing an external dataset in another format. We are primarily thinking of using this functionality for processed, combined datasets which are ready for training. +1. Commit your changes using [conventional commits](https://www.conventionalcommits.org/en/v1.0.0/). +2. Push your changes to your fork. +3. Create a Pull Request to the main ChemNLP repository. +4. Respond to any feedback on your PR. -Feel free to reach out to one of the team and read [this guide](https://huggingface.co/docs/datasets/upload_dataset#share-a-dataset-to-the-hub) for more information. +Thank you for contributing to ChemNLP! Your efforts help advance chemical natural language processing research and applications. diff --git a/docs/EXPERIMENT.md b/docs/EXPERIMENT.md deleted file mode 100644 index 5bf412733..000000000 --- a/docs/EXPERIMENT.md +++ /dev/null @@ -1,77 +0,0 @@ -# Running an experiment - -Here we describe the set-up for training a model (including on the Stability cluster). - -## General set-up - -### Configs - -- Configs are in: `experiments/configs`. -- If you wish to use a new model from Hugging Face as the starting point you will need to tokenise your data. We have an example script for `chemrxiv` which does this here: `experiments/data/prepare_hf_chemrxiv.py`. -- You will also need to create a configuration file for the model if one does not exist e.g. `experiments/configs/hugging-face/full_160M.yml`. - -If the data is already tokenised for the model you wish to use you can proceed to the next step. - -### Miniconda - -We require Miniconda to be installed when working with the training scripts to create Python environments. You can follow the bash script [here](https://github.com/OpenBioML/chemnlp/blob/main/experiments/scripts/miniconda_install.sh) to install Miniconda. - -## Interactive run - -- Create a conda environment as shown in [the documentation](https://github.com/OpenBioML/chemnlp/tree/main/experiments) and install `chemnlp`. -- If using Weights and Biases for logging: `export WANDB_BASE_URL="https://stability.wandb.io"`. -- Run using `torchrun`, for example: - ``` - torchrun --nnodes 1 --nproc-per-node 4 experiments/scripts/run_tune.py experiments/configs/hugging-face/full_160M.yml - ``` -- You can use `nvidia-smi` or `wandb` logging to monitor efficiency during this step. - -## Launching an experiment run through SLURM - -- Take the `sbatch_` script associated with the training run and execute this through an `sbatch` command as shown in [the documentation](https://github.com/OpenBioML/chemnlp/tree/main/experiments). This will build the conda environment and install `chemnlp` before the job begins. Note that building the environment can be a little slow so if you aren't confident your code will run it's best to test it interactively first. -- Example command: - -```bash -sbatch experiments/scripts/sbatch_train_hf.sh $1 $2 $3 # see script for description of arguments -sbatch experiments/scripts/sbatch_train_hf.sh experiments/maw501 maw501 160M_full.yml # explicit example -``` - -- From within the stability cluster, you can monitor your job at `/fsx/proj-chemnlp/experiments/logs` or as set in the `sbatch` script. 
- -## Using Weights and Biases - -If you don't have the required permission to log to W&B, please request this. In the interim you can disable this or log to a project under your name by changing the configuration options e.g. in `experiments/configs/hugging-face/full_160M.yml`. - -## Multi-node training - -This is for Hugging Face fine-tuning only at the moment and is orchestrated through the `torch.distributed` package. It allows you to expand your computing environment to multiple nodes in a distributed data parallel manner. It uses multiprocessing to efficiently parallelise training across devices. In order to enable this feature you simply have to switch to using the `*_multinode` script instead of the original slurm training script as described in the [scripts documentation](https://github.com/OpenBioML/chemnlp/tree/main/experiments). - -## Restarting from a checkpoint - -This is for Hugging Face fine-tuning only at the moment. - -**WARNING:** Hugging Face **does not** know you are restarting from a checkpoint and so you may wish to change `output_dir` in the config file to avoid overwriting old checkpoints. You may wish to use a lower learning rate / different scheduler if continuing training. - -You can restart training from a checkpoint by passing `checkpoint_path`, a directory containing the output from a model saved by HF's `Trainer` class. - -Example config block: - -```yaml -model: - base: GPTNeoXForCausalLM - name: EleutherAI/pythia-160m - revision: main - checkpoint_path: /fsx/proj-chemnlp/experiments/checkpoints/finetuned/full_160M/checkpoint-1600 # directory to restart training from -``` - -## DeepSpeed integration - -This is for Hugging Face fine-tuning only and is described in detail [here](https://huggingface.co/docs/transformers/v4.27.2/en/main_classes/deepspeed). You can enable DeepSpeed through the Hugging Face `TrainerArguments` by adding a configuration key of `deepspeed_config` followed by the name of your configuration file inside of `experiments/configs/deepspeed` configuration directory. - -Example config block: - -```yaml -trainer: - ... - deepspeed_config: deepspeed_offload_S3.json # looks in experiments/configs/deepspeed -``` diff --git a/docs/SUBMODULES.md b/docs/SUBMODULES.md deleted file mode 100644 index a5d7acb37..000000000 --- a/docs/SUBMODULES.md +++ /dev/null @@ -1,73 +0,0 @@ -# Introduction - -This page outlines the workflow for contributing to the ChemNLP project where changes to the Git submodules are required. The project currently has two submodules: - -1. [gpt-neox](https://github.com/OpenBioML/gpt-neox) -2. [lm-eval2](https://github.com/OpenBioML/lm-eval2) - -where both of these are forks from [EleutherAI](https://github.com/EleutherAI). - -# What are git submodules? - -Submodules allow us to keep seperate Git repositories as subdirectories inside ChemNLP. As these submodules are forks we can both make any changes we require to them (and pin a specific commit) as well as periodically integrate changes from the original upstream (EleutherAI) repository. - -You can think of both the `gpt-neox` and `lm-eval2` submodules as separate Git repositories with their own remotes, commit history and branches etc... - -In essence, all the ChemNLP project does is to track which commit we are using for each submodule (to see this run `git submodule status` from `chemnlp`). - -There are many excellent introductions to submodules online and we won't repeat them here. 
Instead we'll outline the process for working with them on the ChemNLP project and we encourage you to read more about them if of interest. Here are some links you might find useful: - -1. [7.11 Git Tools - Submodules](https://git-scm.com/book/en/v2/Git-Tools-Submodules) - section from Pro Git. -2. [Git submodule docs](https://git-scm.com/docs/git-submodule) - the documentation. - -# Getting help - -The instructions below attempt to guide you through the process of working with submodules. However, if you are still confused please reach out on GitHub or Discord to a project maintainer. - -# Workflow 1: making changes to a submodule only - -Example of making a change to the `gpt-neox` submodule for a feature called `add-peft-method`. - -1. [Fork](https://docs.github.com/en/get-started/quickstart/fork-a-repo) the [ChemNLP repository](https://github.com/OpenBioML/chemnlp) from your personal GitHub account. -2. [Clone your fork](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) and the submodules, see: [Cloning submodules](../README.md#cloning-submodules). -3. [Optional, if required for the issue] Install `chemnlp` in your virtual env using `pip install -e` (see installation instructions [here](../README.md#installation-and-set-up)). -4. [Make a new branch](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) e.g. `feat(sub):add-peft-method` in the `gpt-neox` submodule, **not** in `chemnlp`. -5. Make changes to the `gpt-neox` submodule per the issue you are working on. -6. Commit changes in the `gpt-neox` submodule. -7. Push the submodule changes to remote and open a PR in [gpt-neox](https://github.com/OpenBioML/gpt-neox). -8. Once the changes to the submodule are approved, merge them (or a reviewer will). - -The above **only** updates the `gpt-neox` submodule on remote - it **does not** change which commit `chemnlp` is tracking. To do this: - -1. On your fork of `chemnlp`, update to get the latest changes for the `gpt-neox` submodule only: `git submodule update --remote gpt-neox` -2. This will checkout the latest commit on the `main` branch of `gpt-neox`. - - Note: if you want to track a different commit of `gpt-neox` other than the latest then navigate to the `gpt-neox` directory and checkout a specific commit (e.g. your recent merge commit from the `gpt-neox` pull request above): `git checkout ` -3. In `chemnlp` [make a new branch](https://git-scm.com/book/en/v2/Git-Branching-Basic-Branching-and-Merging) e.g. `feat:update-gpt-neox-submodule` -4. Commit this change, push to your fork's remote and open a PR from your fork to the [ChemNLP repository](https://github.com/OpenBioML/chemnlp) which will update the commit the `chemnlp` project tracks. - -Things to note: - -- The remote of `chemnlp` should be your fork. -- The remote of `gpt-neox` should be the [OpenBioML fork](https://github.com/OpenBioML/gpt-neox). - -To see the remotes for a Git repository run: `git remote -v` - -# Workflow 2: making changes to both ChemNLP and a submodule - -If you need to make changes to the main `chemnlp` project at the same time as a submodule the above workflow can be modified to accomodate this. It's advisable to make changes to the submodule first then once these are merged, submit a PR to the [ChemNLP repository](https://github.com/OpenBioML/chemnlp) which (i) adds changes to `chemnlp` and (ii) updates the `gpt-neox` commit which `chemnlp` tracks. 
- -# Appendix - -## Detached HEADs & submodules - -Usually, when working with Git, you have a certain *branch* checked out. However, Git also allows you to check out any arbitrary commit. Working in such a non-branch scenario is called having a "detached HEAD". - -With submodules: using the `update` command (e.g. `git submodule update`) on a submodule *checks out a specific commit - not a branch*. This means that the submodule repository will be in a "detached HEAD" state. - -🚨 **Don't commit on a detached HEAD** 🚨 - -When you work in the submodule directly you should create or checkout a branch before committing your work. - -See also: [why did Git detach my HEAD?](https://stackoverflow.com/questions/3965676/why-did-my-git-repo-enter-a-detached-head-state/3965714#3965714) - -> Any checkout of a commit that is not the name of one of *your* branches will get you a detached HEAD. A SHA1 which represents the tip of a branch still gives a detached HEAD. Only a checkout of a local branch *name* avoids that mode. diff --git a/docs/api/meta_yaml_augmentor.md b/docs/api/meta_yaml_augmentor.md new file mode 100644 index 000000000..0866ac315 --- /dev/null +++ b/docs/api/meta_yaml_augmentor.md @@ -0,0 +1,64 @@ +# Meta YAML Augmenter + +## Overview + +The Meta YAML Augmenter is a tool designed to enhance existing `meta.yaml` files for chemical datasets. It uses Large Language Models (LLMs) to generate additional templates and improve the metadata structure, particularly focusing on advanced sampling methods and template formats. + +## generate_augmented_meta_yaml + +::: chemnlp.data.meta_yaml_augmenter.generate_augmented_meta_yaml +handler: python +options: +show_root_heading: true +show_source: false + +## CLI Interface + +The module provides a command-line interface for easy augmentation of `meta.yaml` files. + +### Usage + +```bash +python -m chemnlp.data.meta_yaml_augmenter [--model MODEL] [--override] +``` + +### Arguments + +- `data_dir` (str): Path to the directory containing the `meta.yaml` file to be augmented. +- `--model` (str, optional): The name of the LLM model to use for augmentation. Default is 'gpt-4o'. +- `--override` (flag): If set, the existing `meta.yaml` file will be overwritten with the augmented version. + +### Example + +```bash +python -m chemnlp.data.meta_yaml_augmenter /path/to/dataset --model gpt-4o --override +``` + +## Augmentation Process + +The augmentation process involves: + +1. Reading the existing `meta.yaml` file from the specified directory. +2. Sending the content to an LLM along with guidelines for creating advanced templates. +3. Parsing the LLM's response to generate an augmented `meta.yaml` structure. +4. Either printing the augmented structure or overwriting the existing file, based on the `override` flag. + +## Notes + +1. **LLM Integration**: This tool requires integration with an LLM service. Ensure you have the necessary credentials and access set up. By default it uses, `gpt-4o`. For this, you need to expose the `OPENAI_API_KEY` environment variable. + +2. **Output Quality**: The quality of the augmented `meta.yaml` depends on the capabilities of the LLM being used. Manual review and adjustment may be necessary. 
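+
+A minimal sketch of the credential set-up mentioned in the first note (assuming the default OpenAI-backed `gpt-4o` model) is to export the key in your shell before invoking the CLI:
+
+```bash
+export OPENAI_API_KEY="<your-api-key>"
+```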
+ +## Example Usage in Python + +```python +from chemnlp.data.meta_yaml_augmenter import generate_augmented_meta_yaml + +data_dir = "/path/to/dataset" +model_name = "gpt-4o" + +augmented_yaml = generate_augmented_meta_yaml(data_dir, model_name) + +if augmented_yaml: + print(yaml.dump(augmented_yaml)) +``` diff --git a/docs/index.md b/docs/index.md index e69de29bb..d87f0b93b 100644 --- a/docs/index.md +++ b/docs/index.md @@ -0,0 +1,4 @@ +# ChemNLP + +ChemNLP is an effort to create the largest dataset of chemical data. +We then use this dataset to train large language models (LLMs). diff --git a/mkdocs.yml b/mkdocs.yml index 2ad61efcd..3bdf90b63 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,8 +1,6 @@ site_name: ChemNLP Documentation theme: name: material - palette: - primary: teal nav: - Home: index.md - User Guide: @@ -10,10 +8,13 @@ nav: - Quick Start: user-guide/quickstart.md - API Reference: - Sampler Module: api/sampler.md + - Sampler CLI: api/sampler_cli.md + - Meta YAML Generator: api/meta_yaml_generator.md + - Meta YAML Augmentor: api/meta_yaml_augmentor.md - Examples: - Basic Usage: examples/basic-usage.md - Advanced Techniques: examples/advanced-techniques.md - - Contributing: contributing.md + - Contributing: CONTRIBUTING.md - Changelog: changelog.md markdown_extensions: - pymdownx.highlight diff --git a/src/chemnlp/data/meta_yaml_augmentor.py b/src/chemnlp/data/meta_yaml_augmentor.py index 5e2b6abce..f5bc86f6c 100644 --- a/src/chemnlp/data/meta_yaml_augmentor.py +++ b/src/chemnlp/data/meta_yaml_augmentor.py @@ -23,7 +23,7 @@ `Is the {SMILES__description} {SMILES#} a {CYP2D6_Substrate__names__noun}:{CYP2D6_Substrate#no&yes}` 3. Conditional Statements: -- Use {COLUMN#not &NULL} for conditional text based on column values. +- Use {COLUMN#not &NULL} for conditional text based on column values. Note that this only makes sense for columns that are boolean. 4. Random Choices: - Use {#option1|option2|option3!} for random selection of text. 
diff --git a/src/chemnlp/data/sampler.py b/src/chemnlp/data/sampler.py index eff28604e..9a7cf5045 100644 --- a/src/chemnlp/data/sampler.py +++ b/src/chemnlp/data/sampler.py @@ -61,7 +61,6 @@ def __init__( def _wrap_identifier(self, identifier: str, value: str) -> str: """Wrap the identifier value with tags if wrap_identifiers is enabled.""" - print("wrap_identifier", identifier, value, self.wrap_identifiers) if not self.wrap_identifiers: return value @@ -164,6 +163,7 @@ def _get_target_from_row(self, sample: pd.Series, var: str) -> str: elif ("#" in var) and ("&" in var): var, choices = var.split("#") choices = choices.split("&") + print("var and choices and sample", var, choices, sample) choice = choices[sample[var]] return "" if choice == "NULL" else choice diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py index a6e9c4f4a..3c806a4ac 100644 --- a/tests/data/test_sampler.py +++ b/tests/data/test_sampler.py @@ -92,6 +92,76 @@ def large_sample_df(): ) +@pytest.fixture +def sample_polymer_df(): + return pd.DataFrame( + { + "PSMILES": ["*CC(*)C", "*CC(C)C*", "*C(CC)CCC*"], + "compound_name": [ + "Poly(propylene)", + "Poly(isobutylene)", + "Poly(pentylene)", + ], + "Tg_exp": [273.15, 200.0, 250.0], + "Tg_calc": [275.0, 205.0, 245.0], + "rho_300K_calc": [0.90, 0.92, 0.88], + "split": ["train", "test", "validation"], + } + ) + + +@pytest.fixture +def sample_polymer_meta(): + return { + "identifiers": [ + { + "id": "PSMILES", + "type": "PSMILES", + "description": "PSMILES representation", + }, + { + "id": "compound_name", + "type": "Other", + "description": "polymer name", + "names": [{"noun": "compound name"}, {"noun": "polymer name"}], + }, + ], + "targets": [ + { + "id": "Tg_exp", + "type": "continuous", + "description": "Experimental glass transition temperature", + "units": "K", + "names": [{"noun": "experimental glass transition temperature"}], + }, + { + "id": "Tg_calc", + "type": "continuous", + "description": "Computed glass transition temperature", + "units": "K", + "names": [{"noun": "computed glass transition temperature"}], + }, + { + "id": "rho_300K_calc", + "type": "continuous", + "description": "Computed density at 300K", + "units": "g/cm³", + "names": [{"noun": "computed density at 300 K"}], + }, + ], + } + + +@pytest.fixture +def sample_polymer_config(): + return { + "DEFAULT_SIGNIFICANT_DIGITS": 2, + "multiple_choice_rnd_symbols": ["", ".)", ")"], + "multiple_choice_benchmarking_templates": False, + "multiple_choice_benchmarking_format": None, + } + + @pytest.fixture def large_sample_meta(sample_meta): sample_meta["targets"].append( @@ -293,3 +363,115 @@ def test_wrapping_with_continuous_value( result = sampler.sample(large_sample_df.iloc[0], template) assert "[BEGIN_SMILES]" in result and "[END_SMILES]" in result assert re.search(r"LogP: \d+\.\d{2}", result) # Checks for 2 decimal places + + +def test_polymer_template_1( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = "The polymer with the {PSMILES__description} of {PSMILES#} has an experimental glass transition temperature of {Tg_exp#} {Tg_exp__units}." 
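+    # {PSMILES__description} and {Tg_exp__units} resolve to the "description" and
+    # "units" fields of the meta entries, while {PSMILES#} and {Tg_exp#} insert the
+    # values from the sampled row.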
+ result = sampler.sample(sample_polymer_df.iloc[0], template) + assert "PSMILES representation" in result + assert "*CC(*)C" in result + assert "273.15" in result + assert "K" in result + + +def test_polymer_template_2( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = "The polymer with the {compound_name__names__noun} of {compound_name#} has a computed density at 300 K of {rho_300K_calc#} {rho_300K_calc__units}." + result = sampler.sample(sample_polymer_df.iloc[1], template) + assert "polymer name" in result or "compound name" in result + assert "Poly(isobutylene)" in result + assert "0.92" in result + assert "g/cm³" in result + + +def test_polymer_question_answer( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = """Question: What is a polymer with a computed glass transition temperature of {Tg_calc#} {Tg_calc__units} and a computed density at 300 K of {rho_300K_calc#} {rho_300K_calc__units}. + +Answer: A polymer with {PSMILES__description} {PSMILES#}""" + result = sampler.sample(sample_polymer_df.iloc[0], template) + assert "275.0" in result + assert "0.90" in result + assert "PSMILES representation" in result + assert "*CC(*)C" in result + + +def test_polymer_multiple_choice( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = """Task: Please answer the multiple choice question. + +Question: Which polymer has an experimental glass transition temperature of {Tg_exp#} {Tg_exp__units}? + +Options: + +{%multiple_choice_enum%3%aA1} + +{compound_name%} + +Answer: {%multiple_choice_result}""" + result = sampler.sample(sample_polymer_df.iloc[0], template) + assert "273.15" in result + assert "K" in result + assert any( + symbol in result for symbol in ["A", "B", "C", "a", "b", "c", "1", "2", "3"] + ) + + # check that the answer is the correct polymer name, i.e. Poly(propylene) + last_line_enum = result.split("\n")[-1].replace("Answer: ", "").strip() + + # find the option with that enum + for line in result.split("\n"): + if line.startswith(last_line_enum): + # if any polymer name is in the line, we run the assert + if any( + polymer_name in line + for polymer_name in [ + "Poly(propylene)", + "Poly(ethylene)", + "Poly(propylene-alt-ethylene)", + ] + ): + assert "Poly(propylene)" in line + + +def test_polymer_property_comparison( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = "The polymer {compound_name#} has an experimental Tg of {Tg_exp#} K and a computed Tg of {Tg_calc#} K." + result = sampler.sample(sample_polymer_df.iloc[0], template) + assert "Poly(propylene)" in result + assert "273.15" in result + assert "275.0" in result + + +def test_polymer_multiple_properties( + sample_polymer_df, sample_polymer_meta, sample_polymer_config +): + sampler = TemplateSampler( + sample_polymer_df, sample_polymer_meta, sample_polymer_config + ) + template = "The polymer with PSMILES {PSMILES#} has a computed Tg of {Tg_calc#} K and a computed density at 300 K of {rho_300K_calc#} g/cm³." 
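+    # Units are written literally here ("K", "g/cm³") rather than via {Tg_calc__units}
+    # and {rho_300K_calc__units}; only the row values are substituted.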
+ result = sampler.sample(sample_polymer_df.iloc[0], template) + assert "*CC(*)C" in result + assert "275.0" in result + assert "0.90" in result