From 6e42a918e76289464260bffa4a0fbc859b28c84a Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Thu, 5 Oct 2023 09:39:30 +0200
Subject: [PATCH 1/2] add timing helper

Fixes #17
---
 src/chemlift/finetune/peftmodels.py  | 9 +++++++++
 src/chemlift/icl/fewshotpredictor.py | 6 ++++++
 2 files changed, 15 insertions(+)

diff --git a/src/chemlift/finetune/peftmodels.py b/src/chemlift/finetune/peftmodels.py
index 8abc2ca..012a2db 100644
--- a/src/chemlift/finetune/peftmodels.py
+++ b/src/chemlift/finetune/peftmodels.py
@@ -21,6 +21,7 @@
 from functools import partial
 from peft.utils.save_and_load import set_peft_model_state_dict
 from fastcore.basics import basic_repr
+import time
 
 
 class ChemLIFTClassifierFactory:
@@ -125,8 +126,14 @@ def __init__(
 
         self.tune_settings["per_device_train_batch_size"] = self.batch_size
 
+        self._fine_tune_time = None
+
     __repr__ = basic_repr(["property_name", "_base_model"])
 
+    @property
+    def fine_tune_time(self):
+        return self._fine_tune_time
+
     def _prepare_df(self, X: ArrayLike, y: ArrayLike):
         rows = []
         for i in range(len(X)):
@@ -255,6 +262,7 @@ def fit(
             dfs.append(formatted)
         formatted = pd.concat(dfs)
 
+        start_time = time.time()
         train_model(
             self.model,
             self.tokenizer,
@@ -263,6 +271,7 @@ def fit(
             hub_model_name=None,
             report_to=None,
         )
+        self._fine_tune_time = time.time() - start_time
 
     def _predict(
         self,
diff --git a/src/chemlift/icl/fewshotpredictor.py b/src/chemlift/icl/fewshotpredictor.py
index 11ed606..68c5312 100644
--- a/src/chemlift/icl/fewshotpredictor.py
+++ b/src/chemlift/icl/fewshotpredictor.py
@@ -7,6 +7,7 @@
 import enum
 from typing import Union
 from chemlift.icl.utils import LangChainChatModelWrapper
+import time
 
 
 class Strategy(enum.Enum):
@@ -86,6 +87,11 @@ def __init__(
         self._materialclass = "molecules"
         self._max_test = max_test
         self._prefix = prefix
+        self._prediction_time = None
+
+    @property
+    def prediction_time(self):
+        return self._prediction_time
 
     def _format_examples(self, examples, targets):
         """Format examples and targets into a string.
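With this patch applied, the elapsed fine-tuning time can be read back as a property once fit() returns; the few-shot predictor gains an analogous prediction_time property. A minimal usage sketch (the property name, model choice, and SMILES inputs below are illustrative only, not taken from the patch):

from chemlift.finetune.peftmodels import ChemLIFTClassifierFactory

# Toy inputs; any SMILES/label arrays work here.
smiles = ["CCO", "c1ccccc1", "CC(=O)O", "CCN"]
labels = [0, 1, 0, 1]

model = ChemLIFTClassifierFactory(
    "transition wavelength class",
    model_name="EleutherAI/pythia-70m-deduped",
).create_model()

model.fit(smiles, labels)

# None before fit() has run; afterwards, the wall-clock seconds
# spent inside train_model.
print(model.fine_tune_time)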
From a9a117c73895760550e335c6e1b372c4dba2134c Mon Sep 17 00:00:00 2001
From: Kevin Maik Jablonka
Date: Wed, 11 Oct 2023 18:48:48 +0200
Subject: [PATCH 2/2] add timing helper

Fixes #17
---
 experiments/scaling/pythia.py | 86 +++++++++++++++++++++++++++++++++++
 1 file changed, 86 insertions(+)
 create mode 100644 experiments/scaling/pythia.py

diff --git a/experiments/scaling/pythia.py b/experiments/scaling/pythia.py
new file mode 100644
index 0000000..7535ec0
--- /dev/null
+++ b/experiments/scaling/pythia.py
@@ -0,0 +1,86 @@
+from gptchem.data import get_photoswitch_data
+
+from chemlift.finetune.peftmodels import PEFTClassifier, ChemLIFTClassifierFactory
+from sklearn.model_selection import train_test_split
+
+from fastcore.xtras import load_pickle, save_pickle
+from gptchem.evaluator import evaluate_classification
+import time
+import os
+
+
+def get_timestr():
+    return time.strftime("%Y-%m-%d_%H-%M-%S")
+
+
+models = [
+    "EleutherAI/pythia-12b-deduped",
+    "EleutherAI/pythia-6.9b-deduped",
+    "EleutherAI/pythia-2.8b-deduped",
+    "EleutherAI/pythia-1.4b-deduped",
+    "EleutherAI/pythia-1b-deduped",
+    "EleutherAI/pythia-410m-deduped",
+    "EleutherAI/pythia-160m-deduped",
+    "EleutherAI/pythia-70m-deduped",
+]
+
+
+def train_test(train_size, model_name, random_state=42):
+    data = get_photoswitch_data()
+
+    data = data.dropna(subset=["SMILES", "E isomer pi-pi* wavelength in nm"])
+
+    data["binned"] = data["E isomer pi-pi* wavelength in nm"].apply(
+        lambda x: 1 if x > data["E isomer pi-pi* wavelength in nm"].median() else 0
+    )
+
+    train, test = train_test_split(
+        data, train_size=train_size, stratify=data["binned"], random_state=random_state
+    )
+
+    train_median = train["E isomer pi-pi* wavelength in nm"].median()
+    train["binned"] = train["E isomer pi-pi* wavelength in nm"].apply(
+        lambda x: 1 if x > train_median else 0
+    )
+    test["binned"] = test["E isomer pi-pi* wavelength in nm"].apply(
+        lambda x: 1 if x > train_median else 0
+    )
+
+    model = ChemLIFTClassifierFactory(
+        "transition wavelength class",
+        model_name=model_name,
+        load_in_8bit=True,
+        inference_batch_size=32,
+        tokenizer_kwargs={"cutoff_len": 50},
+        tune_settings={"num_train_epochs": 32},
+    ).create_model()
+
+    model.fit(train["SMILES"].values, train["binned"].values)
+
+    start = time.time()
+    predictions = model.predict(test["SMILES"].values)
+    end = time.time()
+
+    report = evaluate_classification(test["binned"].values, predictions)
+
+    if not os.path.exists("results"):
+        os.makedirs("results")
+
+    outname = f"results/{get_timestr()}_peft_{model_name.replace('/', '_')}_{train_size}.pkl"
+
+    report["model_name"] = model_name
+    report["train_size"] = train_size
+    report["random_state"] = random_state
+    report["predictions"] = predictions
+    report["targets"] = test["binned"].values
+    report["fine_tune_time"] = model.fine_tune_time
+    report["inference_time"] = end - start
+
+    save_pickle(outname, report)
+
+
+if __name__ == "__main__":
+    for seed in range(5):
+        for model in models:
+            for train_size in [10, 50, 100, 200, 300]:
+                train_test(train_size, model, random_state=seed)
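Each run serializes its report as a pickle; a minimal sketch for inspecting one afterwards, using the same fastcore helper the script imports (the file name below is hypothetical — real names embed the run timestamp, model, and train size):

from fastcore.xtras import load_pickle

# Hypothetical output file written by a previous run of pythia.py.
report = load_pickle("results/2023-10-11_18-48-48_peft_EleutherAI_pythia-70m-deduped_10.pkl")

print(report["fine_tune_time"])  # seconds spent in train_model during fit()
print(report["inference_time"])  # seconds spent in model.predict()
print(report["accuracy"])        # assumes evaluate_classification reports an accuracy key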