diff --git a/data/tabular/melting_points/meta.yaml b/data/tabular/melting_points/meta.yaml index 531cdd75f..1f101b454 100644 --- a/data/tabular/melting_points/meta.yaml +++ b/data/tabular/melting_points/meta.yaml @@ -7,7 +7,7 @@ targets: units: deg C type: continuous names: - - noun: mean melting point + - noun: melting point uris: - id: mp_range description: melting point range @@ -45,45 +45,69 @@ bibtex: } templates: - |- - {#Task: |Task: |!}{#Predict|Estimate!} the melting point of {NAME#}. - {#Answer: |A: |!}The melting point is {mp#} deg C. + {#Task: |Task: |!}{#Predict|Estimate!} the {mp__names__noun} of {NAME#}. + {#Answer: |A: |!}The {mp__names__noun} is {mp#} {mp__units}. - |- - {#Task: |Task: |!}{#Predict|Estimate!} the melting point of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? - {#Answer: |A: |!}{#The melting point is |!}{mp#} deg C. + {#Task: |Task: |!}{#Predict|Estimate!} the {mp__names__noun} of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? + {#Answer: |A: |!}{#The {mp__names__noun} is |!}{mp#} {mp__units}. - |- - {#Question: |Q: !}What is the melting point of {NAME#}? - {#Answer: |A: |!}{#The melting point is |!}{mp#} deg C. + {#Question: |Q: !}What is the {mp__names__noun} of {NAME#}? + {#Answer: |A: |!}{#The melting point is |!}{mp#} {mp__units}. - |- - {#Question: |Q: !}What is the melting point of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? - {#Answer: |A: |!}{#The melting point is |!}{mp#} deg C. + {#Question: |Q: !}What is the {mp__names__noun} of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? + {#Answer: |A: |!}{#The melting point is |!}{mp#} {mp__units}. - |- - {#Question: |Q: !}What is the melting point of {NAME#}? - {#Answer: |A: |!}{#The melting point is in the range |!}{mp_range#} deg C. + {#Question: |Q: !}What is the {mp__names__noun} of {NAME#}? + {#Answer: |A: |!}{#The melting point is in the range |!}{mp_range#} {mp__units}. 
- |- - {#Question: |Q: !}What is the melting point of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? - {#Answer: |A: |!}{#The melting point is in the range |!}{mp_range#} deg C. + {#Question: |Q: !}What is the {mp__names__noun} of a {#molecule|compound!} with the {SMILES__description} {SMILES#}? + {#Answer: |A: |!}{#The melting point is in the range |!}{mp_range#} {mp__units}. - |- - {#Question: |Q: !}What is a compound with a melting point of {mp#} deg C? + {#Question: |Q: !}What is a compound with a {mp__names__noun} of {mp#} {mp__units}? {#Answer: |A: |!}{NAME#} - |- - {#Question: |Q: !}What is a compound with a melting point in the range {mp_range#} deg C? + {#Question: |Q: !}What is a compound with a {mp__names__noun} in the range {mp_range#} {mp__units}? {#Answer: |A: |!}{NAME#} - |- User: I have a question about {NAME#}. Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} - User: What is the melting point of {#this compound|this molecule!}? - Assistant: {#The melting point is |!}{mp#} deg C. + User: What is the {mp__names__noun} of {#this compound|this molecule!}? + Assistant: {#The melting point is |!}{mp#} {mp__units}. - |- User: I have a question about a {#compound|molecule!} with the {SMILES__description} {SMILES#}. Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} - User: What is the melting point of {#this compound|this molecule!}? - Assistant: {#The melting point is |!}{mp#} deg C. {#Is there anything else I can help you with?|Do you have any other questions?|Do you have any other questions for me?|Is there anything else I can help you with today?|Do you have any other questions for me today?!} + User: What is the {mp__names__noun} of {#this compound|this molecule!}? + Assistant: {#The melting point is |!}{mp#} {mp__units}. 
{#Is there anything else I can help you with?|Do you have any other questions?|Do you have any other questions for me?|Is there anything else I can help you with today?|Do you have any other questions for me today?!} User: {#Yes,|Indeed,!} what is the name of {#this compound|this molecule!}? Assistant: {NAME#} - |- User: I have a question about {NAME#}. Assistant: {#Sure, what is your question?|How can I help?|That sounds interesting, how can I help?|Interesting, how can I help?!} - User: What is the melting point of {#this compound|this molecule!}? - Assistant: {#The melting point is |!}{mp_range#} deg C. {#Is there anything else I can help you with?|Do you have any other questions?|Do you have any other questions for me?|Is there anything else I can help you with today?|Do you have any other questions for me today?!} + User: What is the {mp__names__noun} of {#this compound|this molecule!}? + Assistant: {#The melting point is |!}{mp_range#} {mp__units}. {#Is there anything else I can help you with?|Do you have any other questions?|Do you have any other questions for me?|Is there anything else I can help you with today?|Do you have any other questions for me today?!} User: {#Yes,|Indeed,!} what is the {SMILES__description} of {#this compound|this molecule!}? Assistant: {SMILES#} + - |- + Task: Please estimate the {mp__names__noun} of a compound. + + Compound: {NAME#} + + Result: {mp#} {mp__units} + - |- + Task: Please estimate the {mp__names__noun} of a compound. + + {SMILES__description}: {SMILES#} + + Result: {mp#} {mp__units} + - |- + Question: What is the {mp__names__noun} of a compound with the {SMILES__description} {SMILES#} in {mp__units}? + + Answer: {mp#} + - |- + Question: Which molecule has a {mp__names__noun} of {mp#} {mp__units}? + Pick {%multiple_choice_enum%3%aA1}. 
+ + Options: + {SMILES%} + + Answer: {%multiple_choice_result} diff --git a/src/chemnlp/data/meta_yaml_augmentor.py b/src/chemnlp/data/meta_yaml_augmentor.py index ee7502f89..59a93b434 100644 --- a/src/chemnlp/data/meta_yaml_augmentor.py +++ b/src/chemnlp/data/meta_yaml_augmentor.py @@ -76,6 +76,8 @@ This is a multi-line string. It can contain multiple lines. +Stick to the syntax of the templating language that is described above. Do not use any other syntax and do not remove the special characters. + Just return raw YAML string, do not wrap it into backticks or anything else. """ diff --git a/src/chemnlp/data/sampler.py b/src/chemnlp/data/sampler.py index 5c563c5dc..6f7bef0ca 100644 --- a/src/chemnlp/data/sampler.py +++ b/src/chemnlp/data/sampler.py @@ -71,10 +71,22 @@ def _get_additional_targets(self, df: pd.DataFrame) -> List[str]: def _add_additional_targets_to_meta(self): additional_targets_meta = { "selfies": {"id": "selfies", "type": "selfies", "description": "SELFIES"}, - "deepsmiles": {"id": "deepsmiles", "type": "deepsmiles", "description": "DeepSMILES"}, - "canonical": {"id": "canonical", "type": "canonical", "description": "canonical SMILES"}, + "deepsmiles": { + "id": "deepsmiles", + "type": "deepsmiles", + "description": "DeepSMILES", + }, + "canonical": { + "id": "canonical", + "type": "canonical", + "description": "canonical SMILES", + }, "inchi": {"id": "inchi", "type": "inchi", "description": "InChI"}, - "iupac_name": {"id": "iupac_name", "type": "iupac_name", "description": "IUPAC name"}, + "iupac_name": { + "id": "iupac_name", + "type": "iupac_name", + "description": "IUPAC name", + }, } for target in self.additional_targets: self.meta["targets"].append(additional_targets_meta[target]) @@ -538,7 +550,11 @@ def sample(self, sample: pd.Series, template: str) -> str: if sample is None: sample = self.df.sample(1).iloc[0] if self.additional_targets and "SMILES" in sample.index: - non_nan_targets = [target for target in ["SMILES"] + 
self.additional_targets if pd.notna(sample[target])] + non_nan_targets = [ + target + for target in ["SMILES"] + self.additional_targets + if pd.notna(sample[target]) + ] new_target = random.choice(non_nan_targets) if new_target != "SMILES": template = template.replace("{SMILES", "{" + new_target) diff --git a/tests/data/test_sampler.py b/tests/data/test_sampler.py index 34452b678..3ad80a9ae 100644 --- a/tests/data/test_sampler.py +++ b/tests/data/test_sampler.py @@ -19,17 +19,30 @@ def sample_df(): } ) + @pytest.fixture def sample_multiple_identifier_df(): - return pd.DataFrame({ - 'SMILES': ['CC(C)NCC(O)c1ccc(O)c(O)c1', 'CC1=C(C(=O)NC2=C1C=CC=C2)C3=CC=CC=C3'], - 'selfies': ['[C][C][Branch1][C][C][N][C][C][Branch1][O][C][=C][C][=C][C][Branch1][O][C][=C][Branch1][O][C][=C]', '[C][C]1[=C][Branch1][C][Branch2][=O][N][C]2[=C]1[C][=C][C][=C][C]2[C]3[=C][C][=C][C][=C][C]3'], - 'inchi': ['InChI=1S/C11H17NO3/c1-8(2)12-6-9(13)10-4-3-5-11(14)7-10/h3-5,7-9,12-14H,6H2,1-2H3', 'InChI=1S/C15H11NO/c17-14-11-9-5-1-3-7-13(9)16-15(14)10-6-2-4-8-12(10)11/h1-8,16H'], - 'compound_name': ['Isoproterenol', 'Phenytoin'], - 'LogP': [0.08, 2.47], - 'is_active': [True, False], - 'split': ['train', 'test'] - }) + return pd.DataFrame( + { + "SMILES": [ + "CC(C)NCC(O)c1ccc(O)c(O)c1", + "CC1=C(C(=O)NC2=C1C=CC=C2)C3=CC=CC=C3", + ], + "selfies": [ + "[C][C][Branch1][C][C][N][C][C][Branch1][O][C][=C][C][=C][C][Branch1][O][C][=C][Branch1][O][C][=C]", + "[C][C]1[=C][Branch1][C][Branch2][=O][N][C]2[=C]1[C][=C][C][=C][C]2[C]3[=C][C][=C][C][=C][C]3", + ], + "inchi": [ + "InChI=1S/C11H17NO3/c1-8(2)12-6-9(13)10-4-3-5-11(14)7-10/h3-5,7-9,12-14H,6H2,1-2H3", + "InChI=1S/C15H11NO/c17-14-11-9-5-1-3-7-13(9)16-15(14)10-6-2-4-8-12(10)11/h1-8,16H", + ], + "compound_name": ["Isoproterenol", "Phenytoin"], + "LogP": [0.08, 2.47], + "is_active": [True, False], + "split": ["train", "test"], + } + ) + @pytest.fixture def sample_meta(): @@ -69,19 +82,28 @@ def sample_meta(): def sample_multiple_identifier_meta(): return 
{ "targets": [ - {"id": "LogP", "type": "continuous", "description": "Logarithm of partition coefficient"}, - {"id": "is_active", "type": "categorical", "description": "Activity status"} + { + "id": "LogP", + "type": "continuous", + "description": "Logarithm of partition coefficient", + }, + { + "id": "is_active", + "type": "categorical", + "description": "Activity status", + }, ], "identifiers": [ {"id": "SMILES", "type": "SMILES", "description": "SMILES notation"}, - {"id": "compound_name", "type": "Other", "description": "Compound name"} + {"id": "compound_name", "type": "Other", "description": "Compound name"}, ], "templates": [ "The molecule with SMILES {SMILES#} has a LogP of {LogP#}.", - "The compound {compound_name#} is {is_active#active&inactive}." - ] + "The compound {compound_name#} is {is_active#active&inactive}.", + ], } + @pytest.fixture def sample_config(): return { @@ -504,21 +526,40 @@ def test_polymer_multiple_properties( assert "275.0" in result assert "0.90" in result -def test_additional_targets(sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config): - sampler = TemplateSampler(sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config) + +def test_additional_targets( + sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config +): + sampler = TemplateSampler( + sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config + ) assert set(sampler.additional_targets) == {"selfies", "inchi"} - print(sampler.meta['targets']) + print(sampler.meta["targets"]) assert len(sampler.meta["targets"]) == 4 - -def test_sample_with_random_replacement(sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config): - sampler = TemplateSampler(sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config) +def test_sample_with_random_replacement( + sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config +): + sampler = 
TemplateSampler( + sample_multiple_identifier_df, sample_multiple_identifier_meta, sample_config + ) template = "The compound with {SMILES__description} {SMILES#} has a {LogP__description} of {LogP#}" - results = [sampler.sample(sample_multiple_identifier_df.iloc[0], template) for _ in range(20)] + results = [ + sampler.sample(sample_multiple_identifier_df.iloc[0], template) + for _ in range(20) + ] smiles_count = sum("CC(C)NCC(O)c1ccc(O)c(O)c1" in r for r in results) - selfies_count = sum("[C][C][Branch1][C][C][N][C][C][Branch1][O][C][=C][C][=C][C][Branch1][O][C][=C][Branch1][O][C][=C]" in r for r in results) - inchi_count = sum("InChI=1S/C11H17NO3/c1-8(2)12-6-9(13)10-4-3-5-11(14)7-10/h3-5,7-9,12-14H,6H2,1-2H3" in r for r in results) + selfies_count = sum( + "[C][C][Branch1][C][C][N][C][C][Branch1][O][C][=C][C][=C][C][Branch1][O][C][=C][Branch1][O][C][=C]" + in r + for r in results + ) + inchi_count = sum( + "InChI=1S/C11H17NO3/c1-8(2)12-6-9(13)10-4-3-5-11(14)7-10/h3-5,7-9,12-14H,6H2,1-2H3" + in r + for r in results + ) assert smiles_count > 0 assert selfies_count > 0 assert inchi_count > 0