Skip to content

Commit

Permalink
upload darwin benchmark
Browse files Browse the repository at this point in the history
  • Loading branch information
0xTong committed Jul 21, 2023
1 parent 2d6afb2 commit f573a9b
Show file tree
Hide file tree
Showing 5 changed files with 372 additions and 0 deletions.
Binary file added benchmarks/matbench_v0.1_darwin/.DS_Store
Binary file not shown.
22 changes: 22 additions & 0 deletions benchmarks/matbench_v0.1_darwin/info.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
{
"authors": "Tong Xie, Yuwei Wan, Wei Huang, Zhenyu Yin, Yixuan Liu, Shaozhou Wang, Qingyuan Linghu, Imran Razzak, Bram Hoex, Chunyu Kit, Wenjie Zhang",
"algorithm": "Darwin",
"algorithm_long": "Fine tuning DARWIN Natural Science Large Language Model",
"bibtex_refs": "@misc{xie2023large,\n title={Large Language Models as Master Key: Unlocking the Secrets of Materials Science with GPT},\n author={Tong Xie and Yuwei Wan and Wei Huang and Yufei Zhou and Yixuan Liu and Qingyuan Linghu and Shaozhou Wang and Chunyu Kit and Clara Grazian and Wenjie Zhang and Bram Hoex},\n year={2023},\n eprint={2304.02213},\n archivePrefix={arXiv},\n primaryClass={cs.CL}\n}",
"notes": "We provide prompts and call-and-return of our model. The code for evaluating the benchmarks is available at https://github.com/MasterAI-EAM/Darwin-SIT, our base model is available at https://aigreendynamics-my.sharepoint.com/:f:/g/personal/yuwei_greendynamics_com_au/EvZEghuFSZZCguWrCsbk2QMB_eYqv-BRMM4VLhcK8TT4Zw?e=9bnqWW. To train our model, it requires at least 4*A100(80G)",
"requirements": {
"python": [
"git+https://github.com/MasterAI-EAM/Darwin.git",
"matbench==0.1.0",
"numpy",
"rouge_score",
"fire",
"openai",
"transformers>=4.28.1",
"torch",
"sentencepiece",
"tokenizers>=0.13.3",
"wandb"
]
}
}
233 changes: 233 additions & 0 deletions benchmarks/matbench_v0.1_darwin/preprocessing.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,233 @@
import pandas as pd
import random
import json
def convert_gap(train_inputs,train_outputs=None,train=True):
    """Turn experimental band-gap data into instruction-style records.

    Args:
        train_inputs: Anything ``pd.DataFrame`` accepts that produces a
            ``composition`` column (e.g. a Series named ``composition``).
        train_outputs: Like ``train_inputs`` but producing a ``gap expt``
            column; it is joined onto the inputs by index. Only read when
            ``train`` is true.
        train: When true, every record also gets an ``"output"`` entry
            holding the target value.

    Returns:
        A list with one dict per row, in row order, holding
        ``"instruction"``, ``"input"`` and, for training data, ``"output"``.
    """
    input_df = pd.DataFrame(train_inputs)
    if train:
        df = input_df.join(pd.DataFrame(train_outputs))
    else:
        df = input_df

    # The previous if/elif chain also spelled out a fifth phrasing
    # ("Given composition, what is its band gap? ->") guarded by `ran == 4`,
    # which randint(0, 3) can never produce.  The dead branch is dropped
    # rather than enabled so that seeded runs keep generating the same
    # prompts as before.
    questions = (
        "What is band gap of given composition? ->",
        "Write band gap of given composition. ->",
        "Given composition, write its band gap. ->",
        "Tell me band gap of given composition. ->",
    )

    data_list = []
    for _, row in df.iterrows():
        tmp_dict = {
            "instruction": questions[random.randint(0, len(questions) - 1)],
            "input": str(row["composition"]) + "\n",
        }
        if train:
            tmp_dict["output"] = " " + str(row["gap expt"]) + "\n"
        data_list.append(tmp_dict)

    return data_list

def convert_metal(train_inputs,train_outputs=None,train=True,nouns_path='2325_nouns.json'):
    """Turn metal/non-metal classification data into instruction records.

    Args:
        train_inputs: Anything ``pd.DataFrame`` accepts that produces a
            ``composition`` column of strings.
        train_outputs: Like ``train_inputs`` but producing an ``is_metal``
            column; joined onto the inputs by index. Only read when
            ``train`` is true.
        train: When true, records carry an ``"output"`` answer and extra
            "not a composition" distractor records are mixed in (one per 30
            real records).
        nouns_path: JSON file the distractor words are sampled from
            (presumably a list of strings, see
            https://github.com/psobko/Common-English-Nouns). It is only
            opened when at least one distractor is needed.

    Returns:
        A shuffled list of dicts holding ``"instruction"``, ``"input"`` and,
        for training data, ``"output"``.
    """
    input_df = pd.DataFrame(train_inputs)
    if train:
        df = input_df.join(pd.DataFrame(train_outputs))
    else:
        df = input_df

    # key=csv column name, value=slot in Q
    slot = {"composition":"composition"}
    questions = (
        "Is composition metal? ->",
        "Is given composition metal? ->",
        "Given composition, is it metal? ->",
    )

    compositions = df["composition"].tolist()
    # Pair each composition with its label by row position.  Looking the
    # label up with df['is_metal'][i] treats the enumerate() counter as an
    # index *label*, which picks the wrong row (or raises) whenever the
    # index is not exactly 0..n-1.
    answers = df["is_metal"].tolist() if train else [None] * len(compositions)

    data_list = []
    for element, answer in zip(compositions, answers):
        tmp_dict = {
            "instruction": questions[random.randint(0, 2)],
            "input": " " + element + "\n",
        }
        # Anything that is neither true nor false (e.g. NaN from a
        # misaligned join) gets no "output" entry.
        if train and answer in (True, False):
            if answer:
                tmp_dict["output"] = f" Yes, {element} is metal.\n"
            else:
                tmp_dict["output"] = f" No, {element} is not metal.\n"
        data_list.append(tmp_dict)

    if train:
        # add no-answer question according to length of data_list
        add = len(data_list) // 30
        if add:
            # https://github.com/psobko/Common-English-Nouns
            with open(nouns_path, 'r', encoding='utf-8') as f:
                nouns = json.load(f)
            for an in random.sample(nouns, add):
                ran1 = random.randint(0, 2)
                # Always "composition" today; the draw is kept so the seeded
                # random stream stays the same as before.
                ran2 = random.sample(list(slot.values()), 1)[0]
                data_list.append({
                    "instruction": questions[ran1],
                    "input": " " + an + "\n",
                    "output": " "+an+" is not a "+ran2+" and it is not metal.\n",
                })

    # NOTE(review): the shuffle is assumed to apply to test data as well as
    # training data -- confirm against the original file layout.
    random.shuffle(data_list)
    return data_list

def convert_steels(train_inputs,train_outputs=None,train=True,nouns_path='2325_nouns.json'):
    """Turn steel yield-strength data into instruction records.

    Args:
        train_inputs: Anything ``pd.DataFrame`` accepts that produces a
            ``composition`` column of strings.
        train_outputs: Like ``train_inputs`` but producing a
            ``yield strength`` column; joined onto the inputs by index.
            Only read when ``train`` is true.
        train: When true, records carry an ``"output"`` answer and two kinds
            of no-answer records are mixed in: real compositions asked
            without a temperature range (one per 50 real records) and
            non-composition words (one per 30 real records).
        nouns_path: JSON file the non-composition words are sampled from
            (presumably a list of strings, see
            https://github.com/psobko/Common-English-Nouns). It is only
            opened when at least one such word is needed.

    Returns:
        A shuffled list of dicts holding ``"instruction"``, ``"input"`` and,
        for training data, ``"output"``.
    """
    input_df = pd.DataFrame(train_inputs)
    if train:
        df = input_df.join(pd.DataFrame(train_outputs))
    else:
        df = input_df

    # key=csv column name, value=slot in Q
    slot = {"composition":"composition"}
    questions = (
        "What will be the yield strength of given composition at 800-1200 °C? ->",
        "Write a possible yield strength of given composition at 800-1200 °C. ->",
        "Given composition, write its potential yield strength at 800-1200 °C. ->",
    )
    # Same questions without the temperature range, used for the
    # "lack of conditions" records.
    questions_without_conditions = (
        "What is yield strength of composition? ->",
        "Write a possible yield strength of given composition. ->",
        "Given composition, write its potential yield strength. ->",
    )

    compositions = df["composition"].tolist()
    # Pair each composition with its target by row position.  Looking the
    # target up with df['yield strength'][i] treats the enumerate() counter
    # as an index *label*, which picks the wrong row (or raises) whenever
    # the index is not exactly 0..n-1.
    answers = df["yield strength"].tolist() if train else [None] * len(compositions)

    data_list = []
    for element, answer in zip(compositions, answers):
        tmp_dict = {
            "instruction": questions[random.randint(0, 2)],
            "input": " " + element + "\n",
        }
        if train:
            tmp_dict["output"] = " " + str(answer) + "\n"
        data_list.append(tmp_dict)

    if train:
        # add no-answer question according to length of data_list; both
        # counts are taken before any extra record is appended
        add_c = len(data_list) // 50
        add_n = len(data_list) // 30

        for an in random.sample(compositions, add_c):
            ran1 = random.randint(0, 2)
            # Result unused; the draw is kept so the seeded random stream
            # stays the same as before.
            random.sample(list(slot.values()), 1)
            data_list.append({
                "instruction": questions_without_conditions[ran1],
                "input": " " + an + ".\n",
                "output": " Unable to answer due to lack of conditions.\n",
            })

        if add_n:
            # https://github.com/psobko/Common-English-Nouns
            with open(nouns_path, 'r', encoding='utf-8') as f:
                nouns = json.load(f)
            for an in random.sample(nouns, add_n):
                ran1 = random.randint(0, 2)
                # Always "composition" today; drawn to keep the random
                # stream unchanged.
                ran2 = random.sample(list(slot.values()), 1)[0]
                data_list.append({
                    "instruction": questions[ran1],
                    # NOTE(review): unlike every other record this input has
                    # no leading space or trailing newline; left as-is
                    # because changing it alters the generated training
                    # data -- confirm whether that is intended.
                    "input": an,
                    "output": " "+an+" is not a "+ran2+" and it does not have yield strength.\n",
                })

    # NOTE(review): the shuffle is assumed to apply to test data as well as
    # training data -- confirm against the original file layout.
    random.shuffle(data_list)
    return data_list

def convert_glass(train_inputs,train_outputs=None,train=True,nouns_path='2325_nouns.json'):
    """Turn glass-forming-ability data into instruction records.

    Args:
        train_inputs: Anything ``pd.DataFrame`` accepts that produces a
            ``composition`` column.
        train_outputs: Like ``train_inputs`` but producing a ``gfa`` column;
            joined onto the inputs by index. Only read when ``train`` is
            true. A value counts as positive when ``str(value) == "True"``.
        train: When true, records carry an ``"output"`` answer and extra
            "not a composition" distractor records are mixed in (one per 30
            real records).
        nouns_path: JSON file the distractor words are sampled from
            (presumably a list of strings, see
            https://github.com/psobko/Common-English-Nouns). It is only
            opened when at least one distractor is needed.

    Returns:
        A shuffled list of dicts holding ``"instruction"``, ``"input"`` and,
        for training data, ``"output"``.
    """
    input_df = pd.DataFrame(train_inputs)
    if train:
        df = input_df.join(pd.DataFrame(train_outputs))
    else:
        df = input_df

    forms = ("glass formation ability", "glass-forming ability")

    def ask():
        """Draw a question and the property wording it uses."""
        # Draw order (question shape first, wording second) is kept so the
        # seeded random stream stays the same as before.
        ran = random.randint(0, 1)
        form = forms[random.randint(0, 1)]
        if ran == 0:
            question = "Does given composition have " + form + "? ->"
        else:
            question = "Tell me if given composition has " + form + ". ->"
        return question, form

    compositions = [str(c) for c in df["composition"]]
    labels = [str(v) for v in df["gfa"]] if train else [None] * len(compositions)

    data_list = []
    for composition, label in zip(compositions, labels):
        question, form = ask()
        tmp_dict = {
            "instruction": question,
            "input": " " + composition + "\n",
        }
        if train:
            if label == "True":
                tmp_dict["output"] = " Yes, " + composition + " has " + form + ".\n"
            else:
                tmp_dict["output"] = " No, " + composition + " does not have " + form + ".\n"
        data_list.append(tmp_dict)

    if train:
        # add no-answer question according to length of data_list
        add = len(data_list) // 30
        if add:
            # https://github.com/psobko/Common-English-Nouns
            with open(nouns_path, 'r', encoding='utf-8') as f:
                nouns = json.load(f)
            for an in random.sample(nouns, add):
                # The question is built from explicit text here; it used to
                # read a loop variable left over from the row loop above.
                question, form = ask()
                data_list.append({
                    "instruction": question,
                    "input": " " + an + "\n",
                    "output": " "+an+" is not a composition and it has no relation with "+form+".\n",
                })

    # NOTE(review): the shuffle is assumed to apply to test data as well as
    # training data -- confirm against the original file layout.
    random.shuffle(data_list)
    return data_list
1 change: 1 addition & 0 deletions benchmarks/matbench_v0.1_darwin/results.json.gz

Large diffs are not rendered by default.

116 changes: 116 additions & 0 deletions benchmarks/matbench_v0.1_darwin/run.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,116 @@
import json
import os
import random
import subprocess

from matbench.bench import MatbenchBenchmark

from preprocessing import *
random.seed(0)

################################ Darwin Installation ################################

'''
Please clone and install the Darwin package
1) git clone https://github.com/MasterAI-EAM/Darwin.git
2) pip install -r requirements.txt
3) download the base Darwin model from https://aigreendynamics-my.sharepoint.com/:f:/g/personal/yuwei_greendynamics_com_au/EvZEghuFSZZCguWrCsbk2QMB_eYqv-BRMM4VLhcK8TT4Zw?e=9bnqWW
Our base model is built upon LLaMA-7b, trained with 9 datasets: Chembl, ESOL, MoosaviCp, MoosaviDiversity, NagasawaOPV, OPV, Pei, WaterStability
'''

mb = MatbenchBenchmark(
    autoload=True,
    subset=[
        "matbench_expt_is_metal",
        "matbench_steels",
        "matbench_glass",
        "matbench_expt_gap",
    ],
)

# Prompt builder for each dataset.  An unknown dataset name now raises a
# KeyError instead of silently reusing the previous task's data.
converters = {
    "matbench_expt_gap": convert_gap,
    "matbench_expt_is_metal": convert_metal,
    "matbench_steels": convert_steels,
    "matbench_glass": convert_glass,
}

data_dir = 'train_test_data'
# exist_ok so that re-running the script does not crash on directories left
# behind by an earlier run
os.makedirs(data_dir, exist_ok=True)
os.makedirs('matbench_model', exist_ok=True)
fold_data = {0:[],1:[],2:[],3:[],4:[]}

for task in mb.tasks:
    task.load()
    convert = converters[task.dataset_name]
    for fold in task.folds:
        # prepare the data for Darwin
        train_inputs, train_outputs = task.get_train_and_val_data(fold)
        test_inputs, _ = task.get_test_data(fold, include_target=True)
        # transform data into natural language
        training_data = convert(train_inputs, train_outputs)
        test_data = convert(test_inputs, train=False)
        # mix 4 tasks into a single training set
        fold_data[fold] += training_data

        # create test dataset
        test_data_path = data_dir + '/matbench_base_fold_' + str(fold) + '_' + task.dataset_name + '_test.json'
        with open(test_data_path, 'w') as f:
            json.dump(test_data, f)

# creating the training dataset, training and evaluating the model
for fold in fold_data:

    # creating training dataset
    training_data = fold_data[fold]
    random.shuffle(training_data)
    data_path = data_dir + '/matbench_base_fold_' + str(fold) + '_train.json'
    output_path = 'matbench_model/fold' + str(fold)
    with open(data_path, 'w') as f:
        json.dump(training_data, f)

    # train the model
    # Arguments are passed as a list: the old shell string glued
    # "--data_path"/"--output_dir" directly onto their values (no space), so
    # train.py never received them.  check=True stops the run when training
    # fails instead of carrying on to evaluation.
    subprocess.run(
        [
            "torchrun", "--nproc_per_node=8", "--master_port=1212", "train.py",
            "--model_name_or_path", "base_model",
            "--data_path", data_path,
            "--bf16", "True",
            "--output_dir", output_path,
            "--num_train_epochs", "5",
            "--per_device_train_batch_size", "1",
            "--per_device_eval_batch_size", "1",
            "--gradient_accumulation_steps", "2",
            "--evaluation_strategy", "no",
            "--save_strategy", "steps",
            "--save_steps", "500",
            "--save_total_limit", "1",
            "--learning_rate", "2e-5",
            "--weight_decay", "0.",
            "--warmup_ratio", "0.03",
            "--lr_scheduler_type", "cosine",
            "--logging_steps", "1",
            "--fsdp", "full_shard auto_wrap",
            "--fsdp_transformer_layer_cls_to_wrap", "LlamaDecoderLayer",
            "--tf32", "False",
        ],
        check=True,
    )

    # evaluate the model
    for task in mb.tasks:
        test_inputs, _ = task.get_test_data(fold, include_target=True)
        test_data_path = data_dir + '/matbench_base_fold_' + str(fold) + '_' + task.dataset_name + '_test.json'
        subprocess.run(
            [
                "python", "evaluate_matbench.py",
                "--model_path", output_path,
                "--data_path", test_data_path,
                "--dataset", task.dataset_name,
                "--fold", str(fold),
            ],
            check=True,
        )

        # load the prediction result
        # NOTE(review): read from the working directory, not data_dir --
        # presumably where evaluate_matbench.py writes it; confirm.
        result_path = 'matbench_base_fold_' + str(fold) + '_' + task.dataset_name + '_test_result.json'
        with open(result_path) as f:
            data = json.load(f)
        # Predictions are matched to test samples by input text, so the
        # order of the result file does not matter.
        transformed_data = {item['input'].strip(): item['output'] for item in data}
        # Iterating yields the input values in test order (assumes
        # test_inputs is a Series of composition strings -- TODO confirm).
        predicted_output = [transformed_data[composition] for composition in test_inputs]
        task.record(fold, predicted_output)

# save the result
mb.to_file("results.json.gz")

0 comments on commit f573a9b

Please sign in to comment.