diff --git a/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml b/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml deleted file mode 100644 index db2f840c9..000000000 --- a/data/tabular/blood_brain_barrier_martins_et_al/meta.yaml +++ /dev/null @@ -1,154 +0,0 @@ -name: blood_brain_barrier_martins_et_al -description: |- - As a membrane separating circulating blood and brain extracellular - fluid, the blood-brain barrier (BBB) is the protection layer that blocks most - foreign drugs. Thus the ability of a drug to penetrate the barrier to deliver - to the site of action forms a crucial challenge in development of drugs for the - central nervous system. -targets: - - id: penetrate_BBB - description: The ability of a drug to penetrate the blood brain barrier (1) or not (0) - units: - type: boolean - names: - - noun: blood brain barrier penetration - - noun: ADME blood-brain barrier penetration - - verb: penetrates the blood brain barrier to reach the brain - - verb: penetrates the blood brain barrier - - adjective: penetrating the blood brain barrier - - adjective: penetrating the blood brain barrier to reach the brain - uris: -benchmarks: - - name: TDC - link: https://tdcommons.ai/ - split_column: split -identifiers: - - id: SMILES - type: SMILES - description: SMILES - - id: compound_name - type: Other - names: - - noun: compound name - - noun: drug name - - noun: generic drug name - description: compound name -license: CC BY 4.0 -links: - - url: https://doi.org/10.1021/ci300124c - description: corresponding publication - - url: https://rb.gy/0xx91v - description: corresponding publication - - url: https://tdcommons.ai/single_pred_tasks/adme/#bbb-blood-brain-barrier-martins-et-al - description: data source -num_points: 2030 -bibtex: - - |- - @article{Martins2012, - doi = {10.1021/ci300124c}, - url = {https://doi.org/10.1021/ci300124c}, - year = {2012}, - month = jun, - publisher = {American Chemical Society (ACS)}, - volume = {52}, - number = {6}, - pages 
= {1686--1697}, - author = {Ines Filipa Martins and Ana L. Teixeira and Luis Pinheiro - and Andre O. Falcao}, - title = {A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling}, - journal = {Journal of Chemical Information and Modeling} - - |- - @article{Wu2018, - doi = {10.1039/c7sc02664a}, - url = {https://doi.org/10.1039/c7sc02664a}, - year = {2018}, - publisher = {Royal Society of Chemistry (RSC)}, - volume = {9}, - number = {2}, - pages = {513--530}, - author = {Zhenqin Wu and Bharath Ramsundar and Evan~N. Feinberg and Joseph - Gomes and Caleb Geniesse and Aneesh S. Pappu and Karl Leswing and Vijay Pande}, - title = {MoleculeNet: a benchmark for molecular machine learning}, - journal = {Chemical Science} -templates: - - The molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - Based on the {SMILES__description} {#representation |!}{SMILES#}, the molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - The {SMILES__description} {SMILES#} {#represents|is from!} a molecule that is {penetrate_BBB#not &NULL}identified as {penetrate_BBB__names__adjective}. - - The molecule represented with the {SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - "{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}identified as {penetrate_BBB__names__adjective}." - - "{SMILES#} represents a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}." - - "{SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}." - - The {#molecule |!}{SMILES__description} {SMILES#} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - |- - Task: Please classify a molecule based on the description. - Description: A molecule that is {penetrate_BBB__names__adjective}. 
- {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result: {penetrate_BBB#False&True} - - |- - Task: Please classify a molecule based on the description. - Description: A molecule that is {penetrate_BBB__names__adjective}. - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Answer the question in a {#full|complete!} sentence. - Result: This molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - |- - Task: Please {#give me|create|generate!} the {SMILES__description} of a {#molecule|chemical|chemical compound!} based on the {#text |!}description{# below|!}. - Description: A molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - Result: {SMILES#} - - |- - User: Can you {#tell me|derive|estimate!} if the molecule with the {SMILES__description} {SMILES#} is {penetrate_BBB__names__adjective}? - Assistant: {penetrate_BBB#No&Yes}, this molecule is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - |- - User: Is the molecule with the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}? - Assistant: {penetrate_BBB#No&Yes}, it is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - - |- - User: Can you {#give me|create|generate!} the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? - Assistant: {#Yes|Of course|Sure|Yes, I'm happy to help!}, here you go: {SMILES#} - - |- - User: I'm {#searching|looking!} for the {SMILES__description} of a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? - Assistant: This is a molecule that is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} the {SMILES__description} of a {#molecule|chemical!}. - Assistant: {#This sounds very exciting. 
|This sounds very interesting. !}Should I consider any {#constraints|specific points!} for the {#generation|creation!}? - User: Yes, please. The molecule should {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}. - Assistant: {#Ok|Got it!},{# here you go,|!} this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} - - |- - User: I want to {#come up with|create|generate!} a {#molecule |!}{SMILES__description}. - Assistant: {#This sounds very exciting. |This sounds very interesting. !}Should it be a special {#molecule|one!}? - User: Yes, the molecule should {penetrate_BBB#not &NULL}be {penetrate_BBB__names__adjective}. - Assistant: Got it, this {SMILES__description} is {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}: {SMILES#} - - Is the {SMILES__description} {SMILES#} {penetrate_BBB__names__adjective}:{penetrate_BBB#no&yes} - - |- - Task: Please classify a molecule based on the description. - Description: A molecule that is {penetrate_BBB__names__adjective}. - {#Molecule |!}{SMILES__description}: {SMILES#} - Constraint: Even if you are {#uncertain|not sure!}, you must pick either "True" or "False" without using any {#other|additional!} words. - Result:{penetrate_BBB#False&True} - - |- - Task: Please answer the multiple choice question. - Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {penetrate_BBB%} - Answer: {%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? - Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. 
- Options: - {SMILES%penetrate_BBB%} - Answer: {%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Is the molecule with the {SMILES__description} of {SMILES#} {penetrate_BBB__names__adjective}? - Constraint: Even if you are {#uncertain|not sure!}, you must pick either {%multiple_choice_enum%2%aA1} without using any {#other|additional!} words. - Options: - {penetrate_BBB%} - Answer:{%multiple_choice_result} - - |- - Task: Please answer the multiple choice question. - Question: Which molecules are {penetrate_BBB#not &NULL}{penetrate_BBB__names__adjective}? - Constraint: You must select none, one or more options from {%multiple_choice_enum%2-5%aA1} without using any {#other|additional!} words. - Options: - {SMILES%penetrate_BBB%} - Answer:{%multiple_choice_result} diff --git a/docs/api/meta_yaml_generator.md b/docs/api/meta_yaml_generator.md new file mode 100644 index 000000000..09c3f651e --- /dev/null +++ b/docs/api/meta_yaml_generator.md @@ -0,0 +1,42 @@ +# Meta YAML Generator + +## Overview + +The Meta YAML Generator is a tool designed to automatically create a `meta.yaml` file for chemical datasets using Large Language Models (LLMs). It analyzes the structure of a given DataFrame and generates a comprehensive metadata file, including advanced sampling methods and template formats. + +The default model is `gpt-4o`. To use it, set the `OPENAI_API_KEY` environment variable.
+ +## `generate_meta_yaml` + +::: chemnlp.data.meta_yaml_generator.generate_meta_yaml +handler: python +options: +show_root_heading: true +show_source: false + +## Usage Example + +```python +import pandas as pd +from chemnlp.data.meta_yaml_generator import generate_meta_yaml + +# Load your dataset +df = pd.read_csv("your_dataset.csv") + +# Generate meta.yaml +meta_yaml = generate_meta_yaml( + df, + dataset_name="Polymer Properties Dataset", + description="A dataset of polymer properties including glass transition temperatures and densities", + output_path="path/to/save/meta.yaml" +) + +# The meta_yaml variable now contains the dictionary representation of the meta.yaml +# If an output_path was provided, the meta.yaml file has been saved to that location +``` + +You can also use it as a command-line tool: + +```bash +python -m chemnlp.data.meta_yaml_generator path/to/your_dataset.csv --dataset_name "Polymer Properties Dataset" --description "A dataset of polymer properties including glass transition temperatures and densities" --output_path "path/to/save/meta.yaml" +``` diff --git a/src/chemnlp/data/helper.py b/src/chemnlp/data/helper.py index 8f4637f49..730344c40 100644 --- a/src/chemnlp/data/helper.py +++ b/src/chemnlp/data/helper.py @@ -2,6 +2,7 @@ import yaml from typing import Dict, Any from litellm import completion +import fire CONSTANT_PROMPT = """ @@ -100,7 +101,10 @@ 4. Random Choices: - Use {#option1|option2|option3!} for random selection of text. -Generate a similar meta.yaml structure for the given dataset, including appropriate targets, identifiers, and templates based on the column names and example data provided. Include at least one multiple choice template and one benchmarking template.""" +Generate a similar meta.yaml structure for the given dataset, including appropriate targets, identifiers, and templates based on the column names and example data provided. Include at least one multiple choice template and one benchmarking template. 
+ +Just return raw YAML string, no need to wrap it into backticks or anything else. +""" def generate_meta_yaml( @@ -140,9 +144,11 @@ def generate_meta_yaml( # Call the LLM with the prompt llm_response = completion( - model=model, messages=[{"role": "user", "content": prompt}] + model=model, messages=[{"role": "user", "content": prompt}], temperature=0 ) + llm_response = llm_response.choices[0].message.content + # Parse the LLM's response and convert it to a dictionary try: meta_yaml = yaml.safe_load(llm_response) @@ -153,24 +159,41 @@ def generate_meta_yaml( return meta_yaml -# Example usage -if __name__ == "__main__": - # Load your DataFrame - df = pd.read_csv("your_dataset.csv") - - # Generate meta.yaml - meta_yaml = generate_meta_yaml( - df, - dataset_name="Your Dataset Name", - description="A brief description of your dataset", - ) +def cli( + data_path: str, + dataset_name: str, + description: str, + model: str = "gpt-4o", + output_path: str = None, +): + """ + Generate a meta.yaml structure for a dataset using an LLM based on a CSV file. + Args: + data_path (str): The path to the CSV file containing the dataset. + dataset_name (str): The name of the dataset. + description (str): A brief description of the dataset. + model (str, optional): The LLM model to use. Defaults to 'gpt-4o'. + output_path (str, optional): The path to save the generated meta.yaml. Defaults to None. 
+ """ + # Load the dataset from the CSV file + df = pd.read_csv(data_path) + + # Generate the meta.yaml structure + meta_yaml = generate_meta_yaml(df, dataset_name, description, model) + + output_path = output_path or "meta.yaml" # Print or save the generated meta.yaml if meta_yaml: print(yaml.dump(meta_yaml, default_flow_style=False)) # Optionally, save to a file - with open("meta.yaml", "w") as f: + with open(output_path, "w") as f: yaml.dump(meta_yaml, f, default_flow_style=False) else: print("Failed to generate meta.yaml") + + +# Example usage +if __name__ == "__main__": + fire.Fire(cli) diff --git a/src/chemnlp/data/meta.yaml b/src/chemnlp/data/meta.yaml new file mode 100644 index 000000000..6de661876 --- /dev/null +++ b/src/chemnlp/data/meta.yaml @@ -0,0 +1,31 @@ +bibtex: + - "@article{martins2023,\nauthor = {Martins, John and Doe, Jane and Smith, Alice},\ntitle = {Study on Blood-Brain Barrier Penetration of Various Drugs},\njournal = {Journal of Pharmacology},\nvolume = {12},\nnumber = {3},\npages = {123-134},\nyear = {2023},\ndoi = {10.1234/jpharm.2023.56789}}" +description: Describing the ability of different drugs to penetrate the blood-brain barrier. +identifiers: + - description: Simplified Molecular Input Line Entry System + id: SMILES + type: SMILES + - description: Name of the compound + id: compound_name + names: + - noun: compound name + type: Other +license: CC BY 4.0 +links: + - description: corresponding publication + url: https://example.com/publication + - description: data source + url: https://example.com/data_source +name: blood_brain_barrier_martins_et_al +num_points: 2030 +targets: + - description: Indicates whether the compound can penetrate the blood-brain barrier (1 for yes, 0 for no) + id: penetrate_BBB + names: + - noun: blood-brain barrier penetration + type: integer +templates: + - The compound {compound_name__names__noun} with SMILES {SMILES#} can {#penetrate|not penetrate!} the blood-brain barrier. 
+ - The compound {compound_name__names__noun} with SMILES {SMILES#} is in the {split#} set. + - "Question: Which of the following compounds can penetrate the blood-brain barrier?\nOptions: {%multiple_choice_enum%4%aA1}\n{compound_name%}\nAnswer: {%multiple_choice_result}" + - The compound with SMILES {SMILES#} can penetrate the blood-brain barrier:{penetrate_BBB#}