diff --git a/README.md b/README.md index 38de6ac..6212e87 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ venv\bin\activate pip uninstall -r requirements.txt -y pip install -r requirements.txt +git pull --all +git merge --strategy=recursive --strategy-option=ours --no-ff origin/dev ``` diff --git a/chatting.py b/chatting.py deleted file mode 100644 index 4382df6..0000000 --- a/chatting.py +++ /dev/null @@ -1,74 +0,0 @@ -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.chatting.ChattingClass import ChattingClass -from src.mongodb.MongoDBClass import MongoDBClass - -def chatting(args): - """ - Main entry point for the chatting process - Args: - args: Command line arguments - """ - - # Load payload data from the provided directory - payload_data = read_json(args.payload_dir) - - # Extract MongoDB URI from payload data - mongo_uri = payload_data["mongo_uri"] - - # Create an instance of MongoDBClass for database operations - mongodb = MongoDBClass( - db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=mongo_uri) - - # Check if the API key is valid using MongoDB - is_available = mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) - - if is_available: - print("valid api key") - # Initialize the ChattingClass instance for conversation - chatting = ChattingClass( - data_path=payload_data["data_path"], - api_key=payload_data["api_key"], - model_id=payload_data["model_id"], - temperature=payload_data["temperature"]) - - # Ask a question using the ChattingClass instance and get the response - response = chatting.ask_question(args.question) - print(response) - else: - print("invalide api key") - - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test003" - payload_name = "chatting_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - user = "user@gmail.com" - api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA1" - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Conversational Agent.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - p.add_argument("--question", type=str) - p.add_argument("--user", type=Path, default=user, help="User Email") - p.add_argument("--api_key", type=Path, default=api_key, help="API key") - args = p.parse_args() - - # Call the chatting function with the parsed arguments - chatting(args) diff --git a/check_api_key.py b/check_api_key.py deleted file mode 100644 index de8fd6f..0000000 --- a/check_api_key.py +++ /dev/null @@ -1,60 +0,0 @@ - -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.mongodb.MongoDBClass import MongoDBClass - - -def check_api_key(args): - """ - Main function to check the validation of an API key - Args: - - args (argparse.Namespace): Parsed command-line arguments - """ - - # Load payload data from a JSON file - payload_data = read_json(args.payload_dir) - - # Extract MongoDB URI from payload data - mongo_uri = payload_data["mongo_uri"] - - # Create an instance of MongoDBClass to interact with the database - mongodb = MongoDBClass( - 
db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=mongo_uri) - - # Check the validation of the API key and the user - mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) - - # Perform garbage collection to free up memory - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "mongodb_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - user = "user@gmail.com" - api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA" - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Check the API key.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - p.add_argument("--user", type=Path, default=user, help="User Email") - p.add_argument("--api_key", type=Path, default=api_key, help="API key") - args = p.parse_args() - - # Call the check_api_key function with the parsed arguments - check_api_key(args) \ No newline at end of file diff --git a/create_api_key.py b/create_api_key.py deleted file mode 100644 index 298456b..0000000 --- a/create_api_key.py +++ /dev/null @@ -1,79 +0,0 @@ - -import os -import gc -import argparse -from pathlib import Path -from datetime import datetime - -from src.utils.read_json import read_json -from src.mongodb.MongoDBClass import MongoDBClass -from src.utils.utils_funcs import generate_api_key -from src.models.api_model import APIModel - -def create_api_key(args): - """ - Main function to create an API key and store it in a MongoDB database. 
- - Args: - - args (argparse.Namespace): Parsed command-line arguments - """ - - # Load payload data from a JSON file - payload_data = read_json(args.payload_dir) - - # Extract MongoDB URI from payload data - mongo_uri = payload_data["mongo_uri"] - - # Create an instance of MongoDBClass to interact with the database - mongodb = MongoDBClass( - db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=mongo_uri) - - # Generate a new API key - api_key = generate_api_key() - - # Prepare the data for the new API using APIModel - data:APIModel = { - "user": str(Path(args.user)), - "api": api_key, - "title": str(Path(args.title)), - "description": str(Path(args.description)), - "is_removed": False, - "created_at": datetime.now(), - "updated_at": datetime.now(), - } - - # Store the API key data in the MongoDB database - mongodb.create_api(data) - - # Perform garbage collection to free up memory - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "mongodb_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - user = "user@gmail.com" - title = "title" - description = "description" - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Create API key.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - p.add_argument("--user", type=Path, default=user, help="User Email") - p.add_argument("--title", type=Path, default=title, help="Title") - p.add_argument("--description", type=Path, default=description, help="Description") - args = p.parse_args() - - # Call the create_api_key function with the parsed arguments - create_api_key(args) \ No newline at end of file diff --git a/delete_api_key.py b/delete_api_key.py deleted file mode 100644 index 4da4a9a..0000000 --- a/delete_api_key.py +++ /dev/null @@ -1,59 +0,0 @@ - -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.mongodb.MongoDBClass import MongoDBClass - -def delete_api_key(args): - """ - Main function to delete an API key from the MongoDB database collection. 
- - Args: - - args (argparse.Namespace): Parsed command-line arguments - """ - - # Load payload data from a JSON file - payload_data = read_json(args.payload_dir) - - # Extract MongoDB URI from payload data - mongo_uri = payload_data["mongo_uri"] - - # Create an instance of MongoDBClass to interact with the database - mongodb = MongoDBClass( - db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=mongo_uri) - - # Delete the API key identified by the user and the API key value - mongodb.delete_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) - - # Perform garbage collection to free up memory - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "mongodb_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - user = "user@gmail.com" - api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA" - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Delete API key.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - p.add_argument("--user", type=Path, default=user, help="User Email") - p.add_argument("--api_key", type=Path, default=api_key, help="API key") - args = p.parse_args() - - delete_api_key(args) \ No newline at end of file diff --git a/finetuning.py b/finetuning.py deleted file mode 100644 index 4db1d0f..0000000 --- a/finetuning.py +++ /dev/null @@ -1,60 +0,0 @@ -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.finetune.FineTuningClass import FineTuningClass - -def run_fine_tuning(args): - """ - Main function to execute the fine-tuning process using the provided payload. 
- - Args: - - args (argparse.Namespace): Parsed command-line arguments - """ - - # Load payload data from a JSON file - payload_data = read_json(args.payload_dir) - - # Create an instance of FineTuningClass to handle the fine-tuning process - fine_tune = FineTuningClass( - data_path=payload_data["data_path"], - parent_path=payload_data["parent_path"], - api_key=payload_data["api_key"], - model=payload_data["model"], - temperature=payload_data["temperature"], - max_retries=payload_data["max_retries"]) - - # Generate the train and eval data - fine_tune.train_generation() - - # Generate the jsonl - fine_tune.jsonl_generation() - - # Fine tuning - fine_tune.finetune() - - # Perform garbage collection to free up memory - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test003" - payload_name = "finetuning_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Fine tuning.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - args = p.parse_args() - - # Call the run_fine_tuning function with the parsed arguments - run_fine_tuning(args) diff --git a/img2txt_parallel.py b/img2txt_parallel.py deleted file mode 100644 index 45c96ec..0000000 --- a/img2txt_parallel.py +++ /dev/null @@ -1,197 +0,0 @@ - -import os -import gc -import time -import argparse -from pathlib import Path -import concurrent.futures -from datetime import datetime - -from src.utils.read_json import read_json -from src.utils.image_translator import ImageTranslator -from src.utils.chatgpt_communicator import ChatGPTCommunicator - - -def main(args): - """ - The main entry point for the image to text conversion process. 
- - Args: - - args (argparse.Namespace): Parsed command-line arguments - """ - # Timer - start_time = time.time() - - # Payload - payload_data = read_json(args.payload_dir) - - # Read images from the image directory - image_data_path = payload_data["images_data_path"] - image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")] - - # Create an instance of ImageTranslator for image encoding and translation - img_translator = ImageTranslator(api_key=payload_data["api_key"]) - - # Loop over number of images and append all images - # NOTE: User can upload image and add image URLs or just upload image or just add image URLs - images = [] - if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # Encode image - base64_image = img_translator.encode_image(image_path) - images.append((base64_image, False, "auto")) - for img_url in payload_data["image_url"]: - images.append((img_url, True, "auto")) - elif (len(image_list) > 0) and (len(payload_data["image_url"]) == 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # Encode image - base64_image = img_translator.encode_image(image_path) - images.append((base64_image, False, "auto")) - elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0): - for img_url in payload_data["image_url"]: - images.append((img_url, True, "auto")) - - for image in images: - if payload_data["is_parallel"]: - params = [{ - img_translator: img_translator, - image: image - }] * payload_data["parallel_count"] - - with concurrent.futures.ThreadPoolExecutor() as executor: - results = list(executor.map(lambda args: img2txt(*args), params)) - - result = make_one_result(payload_data, results) - else: - result = img2txt(img_translator, image) - - save_to_txt(payload_data, result) - - - # Write into log file - end_time = time.time() - msg = f"Total processing time: {end_time - start_time} seconds" - print(msg) - - # Delete class objects and clean the buffer memory using the garbage collection - gc.collect() - -def save_to_txt(payload_data, result: str): - """ - Save the result to a text file. - - Args: - - payload_data (dict): Payload data - - result (str): The result to be saved - """ - current_time = datetime.now().strftime('%y_%m_%d_%H_%M_%S') - train_path = os.path.join(payload_data["data_path"], "train_data") - os.makedirs(train_path, exist_ok=True) # This line will create the directory if it doesn't exist - - with open(f'{train_path}/{current_time}_data.txt', "a", encoding="utf-8") as f: - f.write(result + "\n\n") # Append the new data to the end of the file - -def img2txt(img_translator: ImageTranslator, image): - """ - Process image to text using the ImageTranslator instance. 
- - Args: - - img_translator (ImageTranslator): Instance of ImageTranslator - - image (str): Image data - - Returns: - - str: Translated text from the image - """ - max_retries = 5 - last_error = "" - - img_translator_response = None # Define the variable and initialize it to None - - for attempt in range(max_retries): - try: - response = img_translator.analyze_images([image]) - - if "choices" in response and response["choices"]: - first_choice = response["choices"][0] - if "message" in first_choice and "content" in first_choice["message"] and first_choice["message"]["content"]: - img_translator_response = first_choice["message"]["content"] - break # Successful response, break out of the loop - else: - last_error = "No valid content in the response." - else: - last_error = "The response structure is not as expected." - - except Exception as e: - last_error = f"Attempt {attempt + 1} failed: {e}" - - if img_translator_response: - break # If a successful response is obtained, exit the loop - - if img_translator_response is None: - raise Exception("Failed to get a valid response after " + str(max_retries) + " attempts. Last error: " + last_error) - - return img_translator_response - -def make_one_result(payload_data, results: [str]): - """ - Combine and process the results using ChatGPT. - - Args: - - payload_data (dict): Payload data - - results (list): List of results from image processing - - Returns: - - str: Final result after processing with ChatGPT - """ - response = payload_data["merge_prompt"] - for index, result in enumerate(results): - response += f"\nresult {index + 1}: {result}" - - # Create chatGPT communicator - chatgpt_communicator = ChatGPTCommunicator(api_key=payload_data["api_key"], language_model=payload_data["language_model"]) - - # Start conversation with ChatGPT using the transcribed or translated text - chatgpt_communicator.create_chat(response) - - # Get conversation with ChatGPT - max_retries = 3 - chatgpt_response = None - - for attempt in range(max_retries): - try: - chatgpt_response = chatgpt_communicator.get_response() - # Check if the response is valid (not None and not empty) - if chatgpt_response: - break # Valid response, break out of the loop - except Exception as e: - print(f"Attempt {attempt + 1} failed: {e}") - if attempt == max_retries - 1: - raise Exception(f"Failed to get a valid response from ChatGPT after {max_retries} attempts. 
Last error: {e}") - - # Print response and use it somewhere else - # print(chatgpt_response) - - - return chatgpt_response - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "img2txt_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Image to Text with parallel.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - args = p.parse_args() - - main(args) diff --git a/main.py b/main.py deleted file mode 100644 index c34a6cc..0000000 --- a/main.py +++ /dev/null @@ -1,271 +0,0 @@ - -import os -import gc -import time -import argparse -from pathlib import Path -import concurrent.futures -from datetime import datetime -import json - -from src.utils.read_json import read_json -from src.utils.image_translator import ImageTranslator -from src.utils.chatgpt_communicator import ChatGPTCommunicator -from src.pdf2img.Pdf2ImgClass import Pdf2ImgClass -from src.finetune.FineTuningClass import FineTuningClass -from src.mathpix.Mathpix import Mathpix -from src.mongodb.MongoDBClass import MongoDBClass - -from src.utils.utils_funcs import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage - -def total_process(args): - start_time = time.time() - - payload_data = read_json(args.payload_dir) - - # Extract MongoDB URI from payload data - mongo_uri = payload_data["mongo_uri"] - - # Call class instance - mongodb = MongoDBClass( - db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=mongo_uri) - - is_available = mongodb.check_validation_api(api_key=str(Path(args.api_key)), user=str(Path(args.user))) - - if is_available: - print("valid api key") - # Separate the data - separate_data(payload_data["data_path"], payload_data["threshold_image_percent_of_pdf"]) - - # pdf to image feature - pdf2img = Pdf2ImgClass( - data_path=payload_data["pdf_data_path"], - parent_path=payload_data["data_path"]) - - pdf2img.pdf2img() - - # img to text feature - # Read images from the image directory - image_list = [] - image_data_path = payload_data["images_data_path"] - - try: - image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")] - except FileNotFoundError: - print("The specified path does not exist or is inaccessible.") - - # Call class instance - img_translator = ImageTranslator(api_key=payload_data["api_key"]) - mathpix = Mathpix(mathpix_app_id=payload_data["mathpix_app_id"], mathpix_app_key=payload_data["mathpix_app_key"]) - - # Loop over number of images and append all images - # NOTE: User can upload image and add image URLs or just upload image or just add image URLs - images = [] - image_paths = [] - if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # Encode image - base64_image = img_translator.encode_image(image_path) - images.append((base64_image, False, "auto")) - image_paths.append(image_path) - for img_url in payload_data["image_url"]: - images.append((img_url, True, "auto")) - image_paths.append(img_url) - elif (len(image_list) > 0) and 
(len(payload_data["image_url"]) == 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # Encode image - base64_image = img_translator.encode_image(image_path) - images.append((base64_image, False, "auto")) - image_paths.append(image_path) - elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0): - for img_url in payload_data["image_url"]: - images.append((img_url, True, "auto")) - image_paths.append(img_url) - - if payload_data["is_gpt"]: - for image in images: - if payload_data["is_parallel"]: - params = [{ - img_translator: img_translator, - image: image - }] * payload_data["parallel_count"] - - with concurrent.futures.ThreadPoolExecutor() as executor: - results = list(executor.map(lambda args: img2txt(*args), params)) - - result = make_one_result(payload_data, results) - else: - result = img2txt(img_translator, image) - - save_to_txt(payload_data, result) - else: - for path in image_paths: - result = mathpix.latex({ - 'src': mathpix.image_uri(path), - 'ocr': ['math', 'text'], - 'formats': ['text', 'latex_styled', 'asciimath', 'mathml', 'latex_simplified'], - 'format_options': { - 'text': { - 'transforms': ['rm_spaces', 'rm_newlines'], - 'math_delims': ['$', '$'] - }, - 'latex_styled': {'transforms': ['rm_spaces']} - } - }) - - # print(json.loads(json.dumps(result, indent=4, sort_keys=True))["text"]) - - save_to_txt(payload_data, json.loads(json.dumps(result, indent=4, sort_keys=True))["text"]) - - # fine tuning - fine_tune = FineTuningClass( - data_path=payload_data["train_data_path"], - parent_path=payload_data["data_path"], - api_key=payload_data["api_key"], - model=payload_data["model"], - temperature=payload_data["temperature"], - max_retries=payload_data["max_retries"]) - - # Generate the train and eval data - fine_tune.train_generation() - - # Generate the jsonl - fine_tune.jsonl_generation() - - # Fine tuning - fine_tune.finetune() - - # Write into log file - end_time = time.time() - msg = f"Total processing time: {end_time - start_time} seconds" - print(msg) - else: - print("invalide api key") - - gc.collect() - -def save_to_txt(payload_data, result: str): - current_time = datetime.now().strftime('%y_%m_%d_%H_%M_%S') - train_path = os.path.join(payload_data["data_path"], "train_data") - os.makedirs(train_path, exist_ok=True) # This line will create the directory if it doesn't exist - - with open(f'{train_path}/{current_time}_data.txt', "a", encoding="utf-8") as f: - f.write(result + "\n\n") # Append the new data to the end of the file - -def img2txt(img_translator: ImageTranslator, image): - max_retries = 5 - last_error = "" - - img_translator_response = None # Define the variable and initialize it to None - - for attempt in range(max_retries): - try: - response = img_translator.analyze_images([image]) - - if "choices" in response and response["choices"]: - first_choice = response["choices"][0] - if "message" in first_choice and "content" in first_choice["message"] and first_choice["message"]["content"]: - img_translator_response = first_choice["message"]["content"] - break # Successful response, break out of the loop - else: - last_error = "No valid content in the response." - else: - last_error = "The response structure is not as expected." 
- - except Exception as e: - last_error = f"Attempt {attempt + 1} failed: {e}" - - if img_translator_response: - break # If a successful response is obtained, exit the loop - - if img_translator_response is None: - raise Exception("Failed to get a valid response after " + str(max_retries) + " attempts. Last error: " + last_error) - - return img_translator_response - -def make_one_result(payload_data, results: [str]): - response = payload_data["merge_prompt"] - for index, result in enumerate(results): - response += f"\nresult {index + 1}: {result}" - - # Create chatGPT communicator - chatgpt_communicator = ChatGPTCommunicator(api_key=payload_data["api_key"], language_model=payload_data["language_model"]) - - # Start conversation with ChatGPT using the transcribed or translated text - chatgpt_communicator.create_chat(response) - - # Get conversation with ChatGPT - max_retries = 3 - chatgpt_response = None - - for attempt in range(max_retries): - try: - chatgpt_response = chatgpt_communicator.get_response() - # Check if the response is valid (not None and not empty) - if chatgpt_response: - break # Valid response, break out of the loop - except Exception as e: - print(f"Attempt {attempt + 1} failed: {e}") - if attempt == max_retries - 1: - raise Exception(f"Failed to get a valid response from ChatGPT after {max_retries} attempts. Last error: {e}") - - # Print response and use it somewhere else - # print(chatgpt_response) - - - return chatgpt_response - -def separate_data(path, threshold): - source_folder = path - images_folder = os.path.join(path, "images") - pdf_folder = os.path.join(path, "pdf") - train_folder = os.path.join(path, "train_data") - - file_list = os.listdir(source_folder) - for file_name in file_list: - file_path = os.path.join(source_folder, file_name) - if os.path.isfile(file_path): - if is_image_file(file_path): - copy_file_to_folder(file_path, images_folder) - elif is_text_file(file_path): - copy_file_to_folder(file_path, train_folder) - elif is_pdf_file(file_path): - # if check_pdf_content(file_path) == "text": - # copy_file_to_folder(file_path, train_folder) - # if has_text(file_path): - # copy_file_to_folder(file_path, train_folder) - if get_image_pages_percentage(file_path) < threshold: - # pdf is mostly consist of text - copy_file_to_folder(file_path, train_folder) - else: - # pdf is mostly consist of image - copy_file_to_folder(file_path, pdf_folder) - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - user = "user@gmail.com" - api_key = "AMEYbpdcmrUxNu_Fb80qutukUZdlsmYiH4g7As5LzNA1" - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Total Process.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - p.add_argument("--user", type=Path, default=user, help="User Email") - p.add_argument("--api_key", type=Path, default=api_key, help="API key") - args = p.parse_args() - - total_process(args) \ No newline at end of file diff --git a/mathpix.py b/mathpix.py deleted file mode 100644 index eb430c1..0000000 --- a/mathpix.py +++ /dev/null @@ -1,104 +0,0 @@ -import os -import gc -import time -import json -import argparse -from pathlib import 
Path - -from src.utils.read_json import read_json -from src.mathpix.Mathpix import Mathpix - -def mathpix(args): - """ - main entry point - """ - - # Payload - payload_data = read_json(args.payload_dir) - - # Read images from the image directory - image_data_path = payload_data["images_data_path"] - image_list = [img for img in os.listdir(image_data_path) if img.endswith(".png") or img.endswith(".jpeg") or img.endswith(".jpg")] - - # Call class instance - mathpix_api = Mathpix(mathpix_app_id=payload_data["mathpix_app_id"], mathpix_app_key=payload_data["mathpix_app_key"]) - - # Loop over number of images and append all images - # NOTE: User can upload image and add image URLs or just upload image or just add image URLs - images = [] - if (len(image_list) > 0) and (len(payload_data["image_url"]) > 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # # Encode image - # base64_image = mathpix_api.encode_image(image_path) - # images.append((base64_image, False, "auto")) - images.append(image_path) - # images.append((image_path, True, "auto")) - for img_url in payload_data["image_url"]: - images.append(img_url) - # images.append((img_url, True, "auto")) - elif (len(image_list) > 0) and (len(payload_data["image_url"]) == 0): - for image in image_list: - image_path = os.path.join(image_data_path, image) - # Encode image - # base64_image = mathpix_api.encode_image(image_path) - # images.append((base64_image, False, "auto")) - images.append(image_path) - # images.append((image_path, True, "auto")) - elif (len(image_list) == 0) and (len(payload_data["image_url"]) > 0): - for img_url in payload_data["image_url"]: - images.append(img_url) - # images.append((img_url, True, "auto")) - - - # Loop over number of requests - for image in images: - print("ssss", image) - # Timer - start_time = time.time() - - # Instantiate class - result = mathpix_api.latex({ - 'src': mathpix_api.image_uri(image), - 'ocr': ['math', 'text'], - 'formats': ['text', 'latex_styled', 'asciimath', 'mathml', 'latex_simplified'], - 'format_options': { - 'text': { - 'transforms': ['rm_spaces', 'rm_newlines'], - 'math_delims': ['$', '$'] - }, - 'latex_styled': {'transforms': ['rm_spaces']} - } - }) - - print(json.loads(json.dumps(result, indent=4, sort_keys=True))["text"]) - - # Print time - end_time = time.time() - msg = f"Total processing time for payload {end_time - start_time} seconds" - print(msg) - - - # Delete class objects and clean the buffer memory using the garbage collection - gc.collect() - - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "img2txt_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - # Set up command line argument parser - p = argparse.ArgumentParser(description="Convert image to text using MathPIX API.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - args = p.parse_args() - - mathpix(args) diff --git a/mongodb.py b/mongodb.py deleted file mode 100644 index f8e3f1e..0000000 --- a/mongodb.py +++ /dev/null @@ -1,46 +0,0 @@ - -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.mongodb.MongoDBClass import MongoDBClass - -def mongodb(args): - """ - main 
entry point - """ - - # Payload - payload_data = read_json(args.payload_dir) - - # Call class instance - mongodb = MongoDBClass( - db_name=payload_data["db_name"], - collection_name=payload_data["collection_name"], - mongo_uri=payload_data["mongo_uri"]) - - mongodb.mongo_connect() - - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "mongodb_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - # Set up command line argument parser - p = argparse.ArgumentParser(description="MongoDB Connection.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - args = p.parse_args() - - mongodb(args) \ No newline at end of file diff --git a/pdf2img.py b/pdf2img.py deleted file mode 100644 index a33a77e..0000000 --- a/pdf2img.py +++ /dev/null @@ -1,45 +0,0 @@ - -import os -import gc -import argparse -from pathlib import Path - -from src.utils.read_json import read_json -from src.pdf2img.Pdf2ImgClass import Pdf2ImgClass - -def pdf2img(args): - """ - main entry point - """ - - # Payload - payload_data = read_json(args.payload_dir) - - # Call class instance - pdf2img = Pdf2ImgClass( - data_path=payload_data["pdf_data_path"], - parent_path=payload_data["data_path"]) - - pdf2img.pdf2img() - - gc.collect() - -if __name__ == "__main__": - # Clean up buffer memory before starting the program - gc.collect() - - # Default values for command line arguments - # Current directory - current_dir = os.path.dirname(os.path.abspath(__file__)) - - # Payload directory - test_name = "regression_test013" - payload_name = "pdf2img_payload.json" - payload_dir = os.path.join(current_dir, "test", "regression", test_name, "payload", payload_name) - - # Set up command line argument parser - p = argparse.ArgumentParser(description="PDF to Image.") - p.add_argument("--payload_dir", type=Path, default=payload_dir, help="Data directory") - args = p.parse_args() - - pdf2img(args) \ No newline at end of file diff --git a/src/chatting/ChattingClass.py b/src/chatting/ChattingClass.py deleted file mode 100644 index e28404f..0000000 --- a/src/chatting/ChattingClass.py +++ /dev/null @@ -1,80 +0,0 @@ -import openai -import os -from dotenv import load_dotenv -from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex -from llama_index.llms import OpenAI - -class ChattingClass: - def __init__(self, model_id, data_path, api_key="", temperature=0.3): - """ - Initialize the ChattingClass. - - Args: - - model_id (str): Identifier for the OpenAI model. - - data_path (str): Path to the data directory. - - api_key (str, optional): OpenAI API key. If not provided, it will be loaded from the environment variables. - - temperature (float, optional): A parameter for controlling the randomness of the model's output. - """ - self.model_id = model_id - self.data_path = data_path - self.temperature = temperature - self.set_api_key(api_key) - self.set_document(data_path) - - def set_api_key(self, api_key): - """ - Set the OpenAI API key for authorization. - - Args: - - api_key (str): OpenAI API key. 
- """ - if api_key: - self.api_key = api_key - else: - # Load API key from environment variables using dotenv - load_dotenv() - self.api_key = os.getenv("OPENAI_API_KEY") - - # Set the OpenAI API key in the environment and OpenAI library - if self.api_key is not None: - os.environ["OPENAI_API_KEY"] = self.api_key - openai.api_key = self.api_key - return True - else: - # In case the API key is not available, handle the situation - # This could involve logging an error, raising an exception, or providing a default value - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - openai.api_key = "openai_api_key" - return False - - def set_document(self, data_path): - """ - Load documents from the specified data directory. - - Args: - - data_path (str): Path to the data directory. - """ - self.documents = SimpleDirectoryReader( - data_path - ).load_data() - - def ask_question(self, question): - """ - Query the OpenAI model with the given question. - - Args: - - question (str): The question to ask the model. - - Returns: - - response (str): The model's response to the question. - """ - ft_context = ServiceContext.from_defaults( - llm=OpenAI(model=self.model_id, temperature=self.temperature), - context_window=2048 - ) - - index = VectorStoreIndex.from_documents(self.documents, service_context=ft_context) - query_engine = index.as_query_engine(service_context=ft_context) - - response = query_engine.query(question) - return response diff --git a/src/chatting/cl_chat_bot.py b/src/chatting/cl_chat_bot.py new file mode 100644 index 0000000..dfc8fb0 --- /dev/null +++ b/src/chatting/cl_chat_bot.py @@ -0,0 +1,106 @@ +import os +import logging +from typing import Optional +from dotenv import load_dotenv + +import openai +from llama_index import ServiceContext, SimpleDirectoryReader, VectorStoreIndex +from llama_index.llms import OpenAI + +class ChatBot: + def __init__(self, model_id, data_path, api_key="", temperature=0.3): + """ + Initialize the ChatBot class. + + Args: + - model_id (str): Identifier for the OpenAI fine tuned model ID. + - data_path (str): Path to the input data directory. + - api_key (str, optional): OpenAI API key. If not provided, it will be loaded from the environment variables. + - temperature (float, optional): A parameter for controlling the randomness of the model's output. + """ + self.model_id = model_id + self.data_path = data_path + self.temperature = temperature + self.__set_api_key(api_key) + self.__set_document(data_path) + + + def __set_api_key(self, api_key=None): + """ + Set the OpenAI API key for authorization. + + Args: + - api_key (str, optional): OpenAI API key. Default is None. + """ + # If api_key is provided and not empty + if api_key and api_key.strip(): + self.api_key = api_key + else: + # Load API key from environment variables + load_dotenv() + self.api_key = os.getenv("OPENAI_API_KEY") + + # If API key is not found in the environment variables, handle the situation + if not self.api_key: + # Here, you can log an error, raise an exception, or provide further instructions + logging.error("OpenAI API key is not provided and not found in environment variables.") + raise + + # Set the OpenAI API key in the environment and OpenAI library + os.environ["OPENAI_API_KEY"] = self.api_key + openai.api_key = self.api_key + return True + + + def __set_document(self, data_path): + """ + Load documents from the specified data directory. + + Args: + - data_path (str): Path to the input data directory. 
+ + Raises: + - FileNotFoundError: If the specified data path does not exist or is inaccessible. + """ + try: + self.documents = SimpleDirectoryReader(data_path).load_data() + except FileNotFoundError as e: + logging.error(f"The specified data path '{data_path}' does not exist or is inaccessible: {e}") + raise + except Exception as e: + logging.error(f"An error occurred while loading data from '{data_path}': {e}") + raise + + + def ask_question(self, question: str) -> Optional[str]: + """ + Query the OpenAI model with the given question. + + Args: + question (str): The question to ask the model. + + Returns: + Optional[str]: The model's response to the question, or None if an error occurs. + """ + try: + # Initialize the service context with the specified model and temperature + ft_context = ServiceContext.from_defaults( + llm=OpenAI(model=self.model_id, temperature=self.temperature), + context_window=2048 + ) + + # Create a vector store index from the documents + # Ensure self.documents is a valid and initialized list of documents + index = VectorStoreIndex.from_documents(self.documents, service_context=ft_context) + + # Create a query engine from the index + query_engine = index.as_query_engine(service_context=ft_context) + + # Query the engine and return the response + response = query_engine.query(question) + return response + + except Exception as e: + # Handle any exceptions that occur during the query + logging.error(f"The response to the question is None because an error occurred during the query: {e}") + return None \ No newline at end of file diff --git a/src/finetune/__init__.py b/src/finetune/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/finetune/FineTuningClass.py b/src/finetune/cl_fine_tuning.py similarity index 88% rename from src/finetune/FineTuningClass.py rename to src/finetune/cl_fine_tuning.py index ad330ae..ba2024f 100644 --- a/src/finetune/FineTuningClass.py +++ b/src/finetune/cl_fine_tuning.py @@ -1,4 +1,5 @@ import os +import logging import openai import random import time @@ -17,9 +18,9 @@ from dotenv import load_dotenv -class FineTuningClass: - def __init__(self, data_path, parent_path, api_key='', model='gpt-3.5-turbo', temperature=0.3, max_retries=5): - """Initialize the FineTuningClass. +class FineTuning: + def __init__(self, data_path, parent_path, api_key="", model='gpt-3.5-turbo', temperature=0.3, max_retries=5): + """Initialize the FineTuning. Args: - data_path (str): The path to the data for fine-tuning. @@ -35,45 +36,56 @@ def __init__(self, data_path, parent_path, api_key='', model='gpt-3.5-turbo', te self.temperature = temperature self.max_retries = max_retries self.retry_delay = 60 - self.set_api_key(api_key) - self.set_document(data_path) - self.generate_subfolder(parent_path) + self.__set_api_key(api_key) + self.__set_document(data_path) + self.__generate_subfolder(parent_path) - def set_api_key(self, api_key): - if api_key: + def __set_api_key(self, api_key=None): + """ + Set the OpenAI API key for authorization. + + Args: + - api_key (str, optional): OpenAI API key. Default is None. 
+ """ + # If api_key is provided and not empty + if api_key and api_key.strip(): self.api_key = api_key else: + # Load API key from environment variables load_dotenv() self.api_key = os.getenv("OPENAI_API_KEY") - if self.api_key is not None: - os.environ["OPENAI_API_KEY"] = self.api_key - openai.api_key = self.api_key - return True - else: - # Handle the absence of the environment variable - # You might want to log an error, raise an exception, or provide a default value - # For example, setting a default value - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - openai.api_key = "openai_api_key" - return False - - - def set_document(self, data_path): - """Load documents from the specified data directory. + # If API key is not found in the environment variables, handle the situation + if not self.api_key: + # Here, you can log an error, raise an exception, or provide further instructions + raise ValueError("OpenAI API key is not provided and not found in environment variables.") + + # Set the OpenAI API key in the environment and OpenAI library + os.environ["OPENAI_API_KEY"] = self.api_key + openai.api_key = self.api_key + return True + + + def __set_document(self, data_path): + """ + Load documents from the specified data directory. Args: - - data_path (str): Path to the data directory. + - data_path (str): Path to the input data directory. + + Raises: + - FileNotFoundError: If the specified data path does not exist or is inaccessible. """ try: self.documents = SimpleDirectoryReader(data_path).load_data() - except Exception: - # Handle the case when the data_path does not exist - print(f"The specified data path '{data_path}' does not exist or is inaccessible.") - exit() - + except FileNotFoundError as e: + logging.error(f"The specified data path '{data_path}' does not exist or is inaccessible: {e}") + raise + except Exception as e: + logging.error(f"An error occurred while loading data from '{data_path}': {e}") + raise - def generate_subfolder(self, parent_path): + def __generate_subfolder(self, parent_path): """Generate a subfolder for storing generated data. 
Args: diff --git a/src/finetune/finetune.py b/src/finetune/finetune.py deleted file mode 100644 index 43fbd14..0000000 --- a/src/finetune/finetune.py +++ /dev/null @@ -1,50 +0,0 @@ -import os -from dotenv import load_dotenv -import openai -import time - -# Set up the OpenAI API key from environment variables -load_dotenv() -openai_api_key = os.getenv("OPENAI_API_KEY") - -if openai_api_key is not None: - os.environ["OPENAI_API_KEY"] = openai_api_key - openai.api_key = openai_api_key -else: - # Handle the absence of the environment variable - # You might want to log an error, raise an exception, or provide a default value - # For example, setting a default value - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - openai.api_key = "openai_api_key" - -# Define the path to the data -data_path = "./test/regression/regression_test003" - -# Upload file for fine-tuning -file_upload = openai.files.create(file=open(f'{data_path}/generated_data/finetuning_events.jsonl', "rb"), purpose="fine-tune") -print("Uploaded file id", file_upload.id) - -# Check status of the file processing -while True: - print("Waiting for file to process...") - file_handle = openai.files.retrieve(file_id=file_upload.id) - if file_handle and file_handle.status == "processed": - print("File processed") - break - time.sleep(3) - -# Initiate fine-tuning job and monitor the progress -try: - job = openai.fine_tuning.jobs.create(training_file=file_upload.id, model="gpt-3.5-turbo") - - while True: - print("Waiting for fine-tuning to complete...") - job_handle = openai.fine_tuning.jobs.retrieve(fine_tuning_job_id=job.id) - if job_handle.status == "succeeded": - print("Fine-tuning complete") - print("Fine-tuned model info", job_handle) - print("Model id", job_handle.fine_tuned_model) - break - time.sleep(3) -except Exception as e: - print(f"An error occurred during fine-tuning: {e}") diff --git a/src/finetune/initital_eval.py b/src/finetune/initital_eval.py deleted file mode 100644 index e37f805..0000000 --- a/src/finetune/initital_eval.py +++ /dev/null @@ -1,70 +0,0 @@ -from llama_index import VectorStoreIndex, ServiceContext, SimpleDirectoryReader -from llama_index.llms import OpenAI - -import os -from dotenv import load_dotenv - -load_dotenv() -openai_api_key = os.getenv("OPENAI_API_KEY") - -if openai_api_key is not None: - os.environ["OPENAI_API_KEY"] = openai_api_key -else: - # Handle the absence of the environment variable - # You might want to log an error, raise an exception, or provide a default value - # For example, setting a default value - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - -data_path = "./test/regression/regression_test003" - -# Set the path to the data directory -documents = SimpleDirectoryReader( - data_path -).load_data() - -# Load questions for evaluation from a file -questions = [] -with open(f'{data_path}/generated_data/eval_questions.txt', "r") as f: - for line in f: - questions.append(line.strip()) - -# limit the context window to 2048 tokens so that refine is used -gpt_35_context = ServiceContext.from_defaults( - llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3), context_window=2048 -) - -# Create a VectorStoreIndex from the loaded documents -index = VectorStoreIndex.from_documents( - documents, service_context=gpt_35_context -) - -# Create a query engine for the index with a specified top-k similarity threshold -query_engine = index.as_query_engine(similarity_top_k=2) - -# Initialize lists to store contexts and answers for evaluation -contexts = [] -answers = [] - -# Query the index 
for each question and gather the contexts and responses -for question in questions: - response = query_engine.query(question) - contexts.append([x.node.get_content() for x in response.source_nodes]) - answers.append(str(response)) - -# Perform evaluation using Ragas framework -from datasets import Dataset -from ragas import evaluate -from ragas.metrics import answer_relevancy, faithfulness - -# Create a dataset from the questions, answers, and corresponding contexts -ds = Dataset.from_dict( - { - "question": questions, - "answer": answers, - "contexts": contexts, - } -) - -# Evaluate the dataset using specific metrics -result = evaluate(ds, [answer_relevancy, faithfulness]) -print(result) \ No newline at end of file diff --git a/src/finetune/jsonl_generation.py b/src/finetune/jsonl_generation.py deleted file mode 100644 index 43fe5a2..0000000 --- a/src/finetune/jsonl_generation.py +++ /dev/null @@ -1,66 +0,0 @@ -from llama_index import ServiceContext, SimpleDirectoryReader -from llama_index.llms import OpenAI -from llama_index.callbacks import OpenAIFineTuningHandler -from llama_index.callbacks import CallbackManager -import os -from dotenv import load_dotenv - -load_dotenv() -openai_api_key = os.getenv("OPENAI_API_KEY") - -if openai_api_key is not None: - os.environ["OPENAI_API_KEY"] = openai_api_key -else: - # Handle the absence of the environment variable - # You might want to log an error, raise an exception, or provide a default value - # For example, setting a default value - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - -# Define the path to the data directory -data_path = "./test/regression/regression_test003" - -# Load documents from the data directory -documents = SimpleDirectoryReader( - data_path -).load_data() - -# Initialize the OpenAIFineTuningHandler and CallbackManager -finetuning_handler = OpenAIFineTuningHandler() -callback_manager = CallbackManager([finetuning_handler]) - -# Create a ServiceContext for GPT-4 model with a limited context window and callback manager -gpt_4_context = ServiceContext.from_defaults( - llm=OpenAI(model="gpt-4", temperature=0.3), - context_window=2048, # limit the context window artifically to test refine process - callback_manager=callback_manager, -) - -# Load training questions from a file -questions = [] -with open(f'{data_path}/generated_data/train_questions.txt', "r") as f: - for line in f: - questions.append(line.strip()) - -from llama_index import VectorStoreIndex - -try: - # Create a VectorStoreIndex with the provided documents and service context - index = VectorStoreIndex.from_documents( - documents, service_context=gpt_4_context - ) - - # Create a query engine for the index with a specified top-k similarity threshold - query_engine = index.as_query_engine(similarity_top_k=2) - - # Query the index for each training question to initiate fine-tuning - for question in questions: - response = query_engine.query(question) - -# Handle any occurring exceptions -except Exception as e: - # Handle the exception here, you might want to log the error or take appropriate action - print(f"An error occurred: {e}") - -# Finally, save the fine-tuning events to a JSONL file -finally: - finetuning_handler.save_finetuning_events(f'{data_path}/generated_data/finetuning_events.jsonl') diff --git a/src/finetune/train_generation.py b/src/finetune/train_generation.py deleted file mode 100644 index 87deba3..0000000 --- a/src/finetune/train_generation.py +++ /dev/null @@ -1,96 +0,0 @@ -import random -from llama_index import SimpleDirectoryReader, 
ServiceContext -from llama_index.llms import OpenAI -from llama_index.evaluation import DatasetGenerator -from dotenv import load_dotenv -import os -import time -import logging -from itertools import cycle - -# Load the OpenAI API key from environment variables -load_dotenv() -openai_api_key = os.getenv("OPENAI_API_KEY") - -if openai_api_key is not None: - os.environ["OPENAI_API_KEY"] = openai_api_key -else: - os.environ["OPENAI_API_KEY"] = "your_default_api_key" - -# Define the path to the data directory -data_path = "./test/regression/regression_test005" - -# Maximum number of retries for the question generation process -max_retries = 5 - -# Time delay between retries (in seconds) -retry_delay = 60 # in seconds - -# Configure logging -logging.basicConfig(level=logging.INFO) -logger = logging.getLogger(__name__) - -# Attempt question generation with retries -for attempt in range(1, max_retries + 1): - try: - # Load documents from the specified data directory - documents = SimpleDirectoryReader(data_path).load_data() - - # Split the documents into two halves for question generation - half_point = len(documents) // 2 # Get the index for the halfway point of the documents - random.seed(42) - random.shuffle(documents) - - # Create a ServiceContext for the GPT-3.5 model - gpt_35_context = ServiceContext.from_defaults( - llm=OpenAI(model="gpt-3.5-turbo", temperature=0.3) - ) - - # Query for question generation - question_gen_query = ( - "You are a Teacher/ Professor. Your task is to setup " - "a quiz/examination. Using the provided context, formulate " - "a single question that captures an important fact from the " - "context. Restrict the question to the context information provided." - ) - - def generate_and_save_questions(documents, output_file, num_questions): - # Initialize the DatasetGenerator - dataset_generator = DatasetGenerator.from_documents( - documents, - question_gen_query=question_gen_query, - service_context=gpt_35_context - ) - questions = [] - - # Create an iterator that cycles through available documents - documents_cycle = cycle(documents) - - # Generate questions until reaching the desired count - while len(questions) < num_questions: - # Use the next document in the cycle - next_document = next(documents_cycle) - dataset_generator = dataset_generator.from_documents([next_document]) - - # Generate questions from the updated dataset - new_questions = dataset_generator.generate_questions_from_nodes(num=num_questions - len(questions)) - questions.extend(new_questions) - - logger.info(f"Generated {len(questions)} questions") - - # Save the generated questions to the output file - with open(output_file, "w") as f: - for question in questions: - f.write(question + "\n") - - # Generate and save training and evaluation questions - generate_and_save_questions(documents[:half_point], f'{data_path}/generated_data/train_questions.txt', 40) - generate_and_save_questions(documents[half_point:], f'{data_path}/generated_data/eval_questions.txt', 40) - - # If successful, break out of the retry loop - break - except Exception as e: - logger.error(f"Error in attempt {attempt}: {e}") - time.sleep(retry_delay * attempt) - -logger.info("Question generation process completed.") \ No newline at end of file diff --git a/src/main.py b/src/main.py index 91e0bbc..8e52cc0 100644 --- a/src/main.py +++ b/src/main.py @@ -59,7 +59,7 @@ async def root(): return RedirectResponse(app.docs_url) # Function to get the payload directory -def get_payload_dir(data_id: str): +def __get_payload_dir(data_id: str): 
"""Get the directory path for the payload file.""" payload_name = "payload.json" if data_id == "": @@ -73,7 +73,7 @@ def get_payload_dir(data_id: str): @app.post("/total") async def total(request_body: MainModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" @@ -121,7 +121,7 @@ async def total(request_body: MainModel): @app.post("/finetuning") async def finetuning(request_body: BasicModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" @@ -146,7 +146,7 @@ async def finetuning(request_body: BasicModel): @app.post("/create_api") async def create_api(request_body: CreateAPIModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" @@ -167,7 +167,7 @@ async def create_api(request_body: CreateAPIModel): @app.post("/delete_api") async def delete_api(request_body: BasicModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" @@ -192,7 +192,7 @@ async def delete_api(request_body: BasicModel): @app.post("/check_api") async def check_api(request_body: BasicModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" @@ -217,7 +217,7 @@ async def check_api(request_body: BasicModel): @app.post("/conversation") async def conversation(request_body: ChattingModel): - payload_dir = get_payload_dir(request_body.data_id) + payload_dir = __get_payload_dir(request_body.data_id) if request_body.user == "": user = "user@gmail.com" diff --git a/src/mathpix/Mathpix.py b/src/mathpix/cl_mathpix.py similarity index 96% rename from src/mathpix/Mathpix.py rename to src/mathpix/cl_mathpix.py index 602fe16..679ef58 100644 --- a/src/mathpix/Mathpix.py +++ b/src/mathpix/cl_mathpix.py @@ -15,9 +15,9 @@ def __init__(self, mathpix_app_id, mathpix_app_key): """ self.mathpix_app_id = mathpix_app_id self.mathpix_app_key = mathpix_app_key - self.set_config(mathpix_app_id, mathpix_app_key) + self.__set_config(mathpix_app_id, mathpix_app_key) - def set_config(self, mathpix_app_id, mathpix_app_key): + def __set_config(self, mathpix_app_id, mathpix_app_key): """ Set Mathpix app ID and app key in the environment variables. diff --git a/src/mongodb/MongoDBClass.py b/src/mongodb/cl_mongodb.py similarity index 96% rename from src/mongodb/MongoDBClass.py rename to src/mongodb/cl_mongodb.py index 7979a27..283cca3 100644 --- a/src/mongodb/MongoDBClass.py +++ b/src/mongodb/cl_mongodb.py @@ -6,9 +6,9 @@ from src.models.api_model import APIModel -class MongoDBClass(): +class MongoDB(): def __init__(self, db_name, collection_name, mongo_uri=""): - """Initialize the MongoDBClass with the database and collection names. + """Initialize the MongoDB with the database and collection names. Args: - db_name (str): Name of the MongoDB database. 
@@ -17,9 +17,9 @@ def __init__(self, db_name, collection_name, mongo_uri=""):
         """
         self.db_name = db_name
         self.collection_name = collection_name
-        self.set_mongo_uri()
+        self.__set_mongo_uri()
 
-    def set_mongo_uri(self):
+    def __set_mongo_uri(self):
         """Set the MongoDB connection URI using environment variables."""
         load_dotenv()
         mongodb_username = os.getenv("MONGODB_USERNAME")
@@ -35,7 +35,7 @@ def set_mongo_uri(self):
 
         self.mongo_uri = mongo_uri
 
-    def mongo_connect(self):
+    def __mongo_connect(self):
         """Connect to the MongoDB server and database, and get the specified collection."""
         try:
             # mongo config
@@ -78,7 +78,7 @@ def mongo_connect(self):
             print("An error occurred:", e)
             return {"result": False, "message": "An error occurred: " + str(e)}
 
-    # def mongo_connect(self):
+    # def __mongo_connect(self):
     #     # mongo config
     #     client = MongoClient(self.mongo_uri)
     #     if client is None:
@@ -125,7 +125,7 @@ def mongo_connect(self):
     def create_api(self, data:APIModel):
         """Create a new API document in the collection."""
         # Connect to MongoDB and get the collection
-        db = self.mongo_connect()
+        db = self.__mongo_connect()
 
         if db["result"] == True:
             collection = db['message']
@@ -146,7 +146,7 @@ def mongo_connect(self):
     def delete_api(self, api_key, user):
         """Soft delete an API document based on the API key and user."""
         # Connect to MongoDB and get the collection
-        db = self.mongo_connect()
+        db = self.__mongo_connect()
 
         if db["result"] == True:
             collection = db['message']
@@ -170,7 +170,7 @@ def delete_api(self, api_key, user):
     def check_validation_api(self, api_key, user):
         """Check if an API document exists and is not soft deleted."""
         # Connect to MongoDB and get the collection
-        db = self.mongo_connect()
+        db = self.__mongo_connect()
 
         if db["result"] == True:
             collection = db['message']
diff --git a/src/pdf2img/Pdf2ImgClass.py b/src/pdf2img/cl_pdf_to_image.py
similarity index 91%
rename from src/pdf2img/Pdf2ImgClass.py
rename to src/pdf2img/cl_pdf_to_image.py
index aa29e77..67a49bd 100644
--- a/src/pdf2img/Pdf2ImgClass.py
+++ b/src/pdf2img/cl_pdf_to_image.py
@@ -4,9 +4,9 @@
 import shutil
 import os
 
-class Pdf2ImgClass:
+class Pdf2Img:
     def __init__(self, data_path, parent_path):
-        """Initialize Pdf2ImgClass with data and parent paths.
+        """Initialize Pdf2Img with data and parent paths.
 
         Args:
         - data_path (str): The path where the PDF files are located.
@@ -15,7 +15,7 @@ def __init__(self, data_path, parent_path):
         self.data_path = data_path
         self.parent_path = parent_path
 
-    def get_poppler_path(self):
+    def __get_poppler_path(self):
         """Retrieve the path to the directory containing pdftoppm executable.
 
         Returns:
@@ -27,7 +27,7 @@ def get_poppler_path(self):
         else:
             return None
 
-    def get_pdf_list(self):
+    def __get_pdf_list(self):
         """Retrieve the list of PDF files in the data directory.
 
         Returns:
@@ -49,14 +49,14 @@ def get_pdf_list(self):
 
     def pdf2img(self):
         """Convert each PDF file in the list to a series of images."""
-        pdf_list = self.get_pdf_list()
+        pdf_list = self.__get_pdf_list()
 
         for index, pdf_path in enumerate(pdf_list):
             current_time = datetime.now().strftime('%y_%m_%d_%H_%M_%S')
             result_path = os.path.join(self.parent_path, "images")
             os.makedirs(result_path, exist_ok=True)  # This line will create the directory if it doesn't exist
 
-            poppler_path = self.get_poppler_path()
+            poppler_path = self.__get_poppler_path()
             print("poppler_path", poppler_path)
 
             try:
diff --git a/src/utils/chatting.py b/src/utils/chatting.py
index 5f21500..46de246 100644
--- a/src/utils/chatting.py
+++ b/src/utils/chatting.py
@@ -2,8 +2,8 @@
 from pathlib import Path
 
 from src.utils.read_json import read_json
-from src.chatting.ChattingClass import ChattingClass
-from src.mongodb.MongoDBClass import MongoDBClass
+from src.chatting.cl_chat_bot import ChatBot
+from src.mongodb.cl_mongodb import MongoDB
 
 def chatting(args):
     """
@@ -19,7 +19,7 @@ def chatting(args):
     mongo_uri = payload_data["mongo_uri"]
 
     # Create an instance of MongoDB connection
-    mongodb = MongoDBClass(
+    mongodb = MongoDB(
         db_name=payload_data["db_name"],
         collection_name=payload_data["collection_name"],
         mongo_uri=mongo_uri)
@@ -29,14 +29,14 @@ def chatting(args):
 
     if is_available:
         print("valid api key")
-        # Create an instance of ChattingClass
-        chatting = ChattingClass(
+        # Create an instance of ChatBot
+        chatting = ChatBot(
             data_path=payload_data["data_path"],
             api_key=payload_data["api_key"],
             model_id=payload_data["model_id"],
             temperature=payload_data["temperature"])
 
-        # Ask a question using the ChattingClass instance
+        # Ask a question using the ChatBot instance
         response = chatting.ask_question(args['question'])
         print(response)
 
@@ -52,4 +52,4 @@ def chatting(args):
     gc.collect()
 
     # Return response for invalid API key
-    return {"status": "success", "fine_tuned_model": "invalide api key"}
+    return {"status": "success", "fine_tuned_model": "invalide api key"}
\ No newline at end of file
diff --git a/src/utils/check_api.py b/src/utils/check_api.py
index a5aaf73..9107e41 100644
--- a/src/utils/check_api.py
+++ b/src/utils/check_api.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from src.utils.read_json import read_json
-from src.mongodb.MongoDBClass import MongoDBClass
+from src.mongodb.cl_mongodb import MongoDB
 
 
 def check_api_key(args):
@@ -19,7 +19,7 @@ def check_api_key(args):
     mongo_uri = payload_data["mongo_uri"]
 
     # Create an instance of MongoDB connection
-    mongodb = MongoDBClass(
+    mongodb = MongoDB(
         db_name=payload_data["db_name"],
         collection_name=payload_data["collection_name"],
         mongo_uri=mongo_uri)
diff --git a/src/utils/create_api.py b/src/utils/create_api.py
index e5597bb..40d813e 100644
--- a/src/utils/create_api.py
+++ b/src/utils/create_api.py
@@ -3,7 +3,7 @@
 from datetime import datetime
 
 from src.utils.read_json import read_json
-from src.mongodb.MongoDBClass import MongoDBClass
+from src.mongodb.cl_mongodb import MongoDB
 from src.utils.utils_funcs import generate_api_key
 from src.models.api_model import APIModel
 
@@ -21,7 +21,7 @@ def create_api_key(args):
     mongo_uri = payload_data["mongo_uri"]
 
     # Create an instance of MongoDB connection
-    mongodb = MongoDBClass(
+    mongodb = MongoDB(
         db_name=payload_data["db_name"],
         collection_name=payload_data["collection_name"],
         mongo_uri=mongo_uri)
diff --git a/src/utils/delete_api.py b/src/utils/delete_api.py
index 16d6977..f655229 100644
--- a/src/utils/delete_api.py
+++ b/src/utils/delete_api.py
@@ -2,7 +2,7 @@
 from pathlib import Path
 
 from src.utils.read_json import read_json
-from src.mongodb.MongoDBClass import MongoDBClass
+from src.mongodb.cl_mongodb import MongoDB
 
 def delete_api_key(args):
     """
@@ -18,7 +18,7 @@ def delete_api_key(args):
     mongo_uri = payload_data["mongo_uri"]
 
     # Create an instance of MongoDB connection
-    mongodb = MongoDBClass(
+    mongodb = MongoDB(
         db_name=payload_data["db_name"],
         collection_name=payload_data["collection_name"],
         mongo_uri=mongo_uri)
diff --git a/src/utils/total_process.py b/src/utils/total_process.py
index 1af9c2c..802b10a 100644
--- a/src/utils/total_process.py
+++ b/src/utils/total_process.py
@@ -10,10 +10,10 @@
 from src.utils.read_json import read_json
 from src.utils.image_translator import ImageTranslator
 from src.utils.chatgpt_communicator import ChatGPTCommunicator
-from src.pdf2img.Pdf2ImgClass import Pdf2ImgClass
-from src.finetune.FineTuningClass import FineTuningClass
-from src.mathpix.Mathpix import Mathpix
-from src.mongodb.MongoDBClass import MongoDBClass
+from src.pdf2img.cl_pdf_to_image import Pdf2Img
+from src.finetune.cl_fine_tuning import FineTuning
+from src.mathpix.cl_mathpix import Mathpix
+from src.mongodb.cl_mongodb import MongoDB
 from src.utils.utils_funcs import is_image_file, is_pdf_file, is_text_file, copy_file_to_folder, get_image_pages_percentage
 
 
@@ -26,7 +26,7 @@ def total_process(args):
     mongo_uri = payload_data["mongo_uri"]
 
     # Call class instance
-    mongodb = MongoDBClass(
+    mongodb = MongoDB(
         db_name=payload_data["db_name"],
         collection_name=payload_data["collection_name"],
         mongo_uri=mongo_uri)
@@ -39,7 +39,7 @@ def total_process(args):
     separate_data(payload_data["data_path"], payload_data["threshold_image_percent_of_pdf"])
 
     # pdf to image feature
-    pdf2img = Pdf2ImgClass(
+    pdf2img = Pdf2Img(
         data_path=payload_data["pdf_data_path"],
         parent_path=payload_data["data_path"])
 
@@ -121,7 +121,7 @@ def total_process(args):
     save_to_txt(payload_data, json.loads(json.dumps(result, indent=4, sort_keys=True))["text"])
 
     # fine tuning
-    fine_tune = FineTuningClass(
+    fine_tune = FineTuning(
         data_path=payload_data["train_data_path"],
         parent_path=payload_data["data_path"],
         api_key=payload_data["api_key"],