From 448e6fea6b9bc722c0d039038112c5a2879766b2 Mon Sep 17 00:00:00 2001
From: "Aivin V. Solatorio"
Date: Tue, 3 Dec 2024 00:31:28 +0100
Subject: [PATCH] Expose system message and bump version (#32)

* Make system message controllable

Signed-off-by: Aivin V. Solatorio

* Bump version to v0.0.10

Signed-off-by: Aivin V. Solatorio

---------

Signed-off-by: Aivin V. Solatorio
---
 llm4data/__init__.py                         | 2 +-
 llm4data/augmentation/microdata/theme_llm.py | 8 ++++++--
 pyproject.toml                               | 2 +-
 3 files changed, 8 insertions(+), 4 deletions(-)

diff --git a/llm4data/__init__.py b/llm4data/__init__.py
index d710744..a9e71c9 100644
--- a/llm4data/__init__.py
+++ b/llm4data/__init__.py
@@ -7,7 +7,7 @@
 # Do this before importing any other modules.
 dotenv.load_dotenv()
 
-__version__ = "0.0.9"
+__version__ = "0.0.10"
 
 indicator2name = dict(
     wdi=json.load((Path(__file__).parent / "wdi2name.json").open("r"))
diff --git a/llm4data/augmentation/microdata/theme_llm.py b/llm4data/augmentation/microdata/theme_llm.py
index db32773..766c49d 100644
--- a/llm4data/augmentation/microdata/theme_llm.py
+++ b/llm4data/augmentation/microdata/theme_llm.py
@@ -87,7 +87,7 @@ class ThemeLLM(object):
         - an idno of a study in the form of a string. The data dictionary will be retrieved from a specified NADA catalog.
     """
 
-    def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictionary: Union[str, Path, dict] = None, catalog_url: str = None, vars_dir: Union[str, Path] = None, desc_dir: Union[str, Path] = None, force: bool = False, persist: bool = True):
+    def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictionary: Union[str, Path, dict] = None, catalog_url: str = None, vars_dir: Union[str, Path] = None, desc_dir: Union[str, Path] = None, force: bool = False, persist: bool = True, system_message: str = SYSTEM_MESSAGE):
         """
         Initialize the ThemeLLM object.
 
@@ -127,6 +127,7 @@ def __init__(self, idno: str, llm_model_id: str = "gpt-3.5-turbo", data_dictiona
 
         # Set the LLM model id.
         self.llm_model_id = llm_model_id
+        self.system_message = system_message
 
         # State variables
         self.clusters = None
@@ -251,7 +252,7 @@ def clustering(self, embeddings: np.ndarray, n_clusters: int = 20, n_components:
 
         return aggcl.fit_predict(tsvd.fit_transform(embeddings))
 
-    def generate_prompts(self, force: bool = False, system_message: str = SYSTEM_MESSAGE, max_input_tokens: int = 2500, system_num_tokens: int = 100, special_sep: str = SPECIAL_SEP):
+    def generate_prompts(self, force: bool = False, system_message: str = None, max_input_tokens: int = 2500, system_num_tokens: int = 100, special_sep: str = SPECIAL_SEP):
         """
         Generate the prompts for the microdata variables.
         """
@@ -261,6 +262,9 @@ def generate_prompts(self, force: bool = False, system_message: str = SYSTEM_MES
 
         idno_data = []
 
+        # Set the system message.
+        system_message = system_message or self.system_message
+
         for cluster in tqdm(self.clusters["cluster"].keys()):
             cluster_labels = self.clusters["cluster"][cluster]
             prompt = PromptZeros.build_message(
diff --git a/pyproject.toml b/pyproject.toml
index 418896e..b102553 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llm4data"
-version = "0.0.9"
+version = "0.0.10"
 description = "LLM4Data is a Python library designed to facilitate the application of large language models (LLMs) and artificial intelligence for development data and knowledge discovery."
 authors = ["Aivin V. Solatorio "]
 readme = "README.md"