From 448e6fea6b9bc722c0d039038112c5a2879766b2 Mon Sep 17 00:00:00 2001
From: "Aivin V. Solatorio"
Date: Tue, 3 Dec 2024 00:31:28 +0100
Subject: [PATCH] Expose system message and bump version (#32)

* Make system message controllable

Signed-off-by: Aivin V. Solatorio

* Bump version to v0.0.10

Signed-off-by: Aivin V. Solatorio

---------

Signed-off-by: Aivin V. Solatorio
---
 llm4data/__init__.py                         | 2 +-
 llm4data/augmentation/microdata/theme_llm.py | 8 ++++++--
 pyproject.toml                               | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llm4data/__init__.py b/llm4data/__init__.py
index d710744..a9e71c9 100644
--- a/llm4data/__init__.py
+++ b/llm4data/__init__.py
@@ -7,7 +7,7 @@
 # Do this before importing any other modules.
 dotenv.load_dotenv()
 
-__version__ = "0.0.9"
+__version__ = "0.0.10"
 
 indicator2name = dict(
     wdi=json.load((Path(__file__).parent / "wdi2name.json").open("r"))
diff --git a/llm4data/augmentation/microdata/theme_llm.py b/llm4data/augmentation/microdata/theme_llm.py
index db32773..766c49d 100644
--- a/llm4data/augmentation/microdata/theme_llm.py
+++ b/llm4data/augmentation/microdata/theme_llm.py
@@ -87,7 +87,7 @@ class ThemeLLM(object):
         - an idno of a study in the form of a string. The data dictionary will be retrieved from a specified NADA catalog.
     """
 
-    def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictionary: Union[str, Path, dict] = None, catalog_url: str = None, vars_dir: Union[str, Path] = None, desc_dir: Union[str, Path] = None, force: bool = False, persist: bool = True):
+    def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictionary: Union[str, Path, dict] = None, catalog_url: str = None, vars_dir: Union[str, Path] = None, desc_dir: Union[str, Path] = None, force: bool = False, persist: bool = True, system_message: str = SYSTEM_MESSAGE):
         """
         Initialize the ThemeLLM object.
 
@@ -127,6 +127,7 @@ def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictiona
 
         # Set the LLM model id.
         self.llm_model_id = llm_model_id
+        self.system_message = system_message
 
         # State variables
         self.clusters = None
@@ -251,7 +252,7 @@ def clustering(self, embeddings: np.ndarray, n_clusters: int = 20, n_components:
 
         return aggcl.fit_predict(tsvd.fit_transform(embeddings))
 
-    def generate_prompts(self, force: bool = False, system_message: str = SYSTEM_MESSAGE, max_input_tokens: int = 2500, system_num_tokens: int = 100, special_sep: str = SPECIAL_SEP):
+    def generate_prompts(self, force: bool = False, system_message: str = None, max_input_tokens: int = 2500, system_num_tokens: int = 100, special_sep: str = SPECIAL_SEP):
         """
         Generate the prompts for the microdata variables.
         """
@@ -261,6 +262,9 @@ def generate_prompts(self, force: bool = False, system_message: str = SYSTEM_MES
 
         idno_data = []
 
+        # Set the system message.
+        system_message = system_message or self.system_message
+
         for cluster in tqdm(self.clusters["cluster"].keys()):
             cluster_labels = self.clusters["cluster"][cluster]
             prompt = PromptZeros.build_message(
diff --git a/pyproject.toml b/pyproject.toml
index 418896e..b102553 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm4data"
-version = "0.0.9"
+version = "0.0.10"
 description = "LLM4Data is a Python library designed to facilitate the application of large language models (LLMs) and artificial intelligence for development data and knowledge discovery."
 authors = ["Aivin V. Solatorio "]
 readme = "README.md"