From 8173f10dddb3f73321bd649cf0af19a262fd0dc9 Mon Sep 17 00:00:00 2001 From: ptmr Date: Mon, 17 Apr 2023 11:12:12 +0200 Subject: [PATCH] V1 optimized API Call by using system message, renaming of folders --- .env.example | 2 + .gitignore | 1 + README.md | 52 +++++++++++++++++ autorename.py | 152 +++++++++++++++++++++++++++++++++++++++++--------- 4 files changed, 182 insertions(+), 25 deletions(-) create mode 100644 .env.example create mode 100644 .gitignore create mode 100644 README.md diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..20bd6c7 --- /dev/null +++ b/.env.example @@ -0,0 +1,2 @@ +OPENAI_API_KEY=your_api_key_here +MY_COMPANY_NAME=your_company_name_here \ No newline at end of file diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..4c49bd7 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/README.md b/README.md new file mode 100644 index 0000000..0377c7b --- /dev/null +++ b/README.md @@ -0,0 +1,52 @@ +AIAutoRename +========== + +AIAutoRename is a Python script that automatically renames PDF files based on their content. It uses the [OpenAI GPT API](https://platform.openai.com/account/api-keys) to extract relevant information such as the document date, company name, and document type from the PDF's text. + +Installation +------------ + +To use AIAutoRename, you'll need Python 3.6 or later. You can download it from the [official Python website](https://www.python.org/downloads/) or the Microsoft Store. + +After installing Python, you can install the required packages by running the following command in your terminal: + +`pip install python-dotenv pdf2image pytesseract openai dateparser` + +Now clone or download this repository and navigate to the root directory of the project in your terminal. + +Configuration +------------- + +AIAutoRename uses environment variables to configure the OpenAI API key and the name of your company. 
Before running the script, you'll need to create a file named `.env` in the root directory of the project and add the following lines: + + +``` +OPENAI_API_KEY=your_api_key_here +MY_COMPANY_NAME=your_company_name_here +``` + +You can obtain an API key from the [OpenAI website](https://beta.openai.com/docs/developer-quickstart/your-api-keys). The `MY_COMPANY_NAME` variable should be set to your company's name. This lets the OpenAI API know who you are, so it can decide whether to use the sender or the recipient of the PDF document. + +Usage +----- + +### Renaming a single PDF file + +To rename a single PDF file, run the following command in your terminal (cmd on Windows, terminal on Mac): + +`python autorename.py path/to/invoice.pdf` + +Replace `path/to/invoice.pdf` with the path to your PDF file. + +### Renaming all PDF files in a folder + +To rename all PDF files in a folder and its subfolders, run the following command in your terminal: + +`python autorename.py path/to/folder` + +Replace `path/to/folder` with the path to your folder (no trailing slash). + +Contributing +------------ + +We welcome contributions from anyone! If you find a bug or have a feature request, please open an issue on our [GitHub repository](https://github.com/example/AIAutoRename). If you'd like to contribute code, please open a pull request with your changes. 
\ No newline at end of file diff --git a/autorename.py b/autorename.py index bf9cd6e..d09d6b0 100644 --- a/autorename.py +++ b/autorename.py @@ -1,4 +1,5 @@ import os +from dotenv import load_dotenv import sys from pdf2image import convert_from_path import pytesseract @@ -7,51 +8,152 @@ import dateparser import re -openai.api_key = +load_dotenv() +openai.api_key = os.getenv("OPENAI_API_KEY") +my_company_name = os.getenv("MY_COMPANY_NAME") + def pdf_to_text(pdf_path): - images = convert_from_path(pdf_path) + images = convert_from_path(pdf_path, first_page=1, last_page=1) text = '' for image in images: text += pytesseract.image_to_string(image) return text + +def truncate_text(text, max_tokens=2048): + tokens = text.split() + truncated_text = ' '.join(tokens[:max_tokens]) + return truncated_text + + +def is_valid_filename(filename: str) -> bool: + forbidden_characters = r'[<>:"/\\|?*]' + return not re.search(forbidden_characters, filename) + + def get_openai_response(text): - completion = openai.ChatCompletion.create( - model="gpt-3.5-turbo", - messages=[ - {"role": "user", "content": f"Extract the company name, date, and document type from the following text:\n\n{text}"}, - {"role": "user", "content": "Return only a JSON object with these properties only `company_name`, `document_date`, `document_type`. No additional text."}, - {"role": "user", "content": "Do not include the legal type in the company name. Typical legal types are: AG, GmbH, e.U., KG, OG, etc. Always strip those from the company name."}, - {"role": "user", "content": "If the text language is German, assume a European date format (dd.mm.YYYY or dd/mm/YYYY or reverse) in the text. Return format: dd.mm.YYYY"}, - {"role": "user", "content": "Valid document types are: For Invoices use the term 'ER' only, nothing more. 
For all other documents, find a short descriptive summary/subject in german language."}, - ] - ) - response = completion.choices[0].message["content"] - print(response) - return json.loads(response) + max_attempts = 3 + attempt = 0 + + while attempt < max_attempts: + print(f'Attempt {attempt+1}/{max_attempts}') + print('---------------------------------') + print('PDF text (preview):') + print({text[:1000]}) + print('---------------------------------') + + completion = openai.ChatCompletion.create( + model="gpt-3.5-turbo", + messages=[ + { + "role": "system", + "content": + "You will be asked to extract the company name, document date, and document type from a PDF document." + + "Due to the nature of OCR, the text will be very noisy and might contain spelling errors, handle those as good as possible." + + "You will only return a JSON object with these properties only \"company_name\", \"document_date\", \"document_type\"." + + "No additional text and no formatting. Only the JSON object." + + "If the text language is German, assume a European date format (dd.mm.YYYY or dd/mm/YYYY or reverse) in the text. Return format: dd.mm.YYYY" + + "Valid document types are: For Invoices use the term 'ER' only, nothing more. For all other documents, find a short descriptive summary/subject in german language." 
+ + "Here are three example responses for training purpose only:" + + "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"ER\"} " + + "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"Einzahlungsbestätigung\"} " + + "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"Angebot\"}" + }, + {"role": "user", "content": f"Extract the \"company_name\", \"document_date\", \"document_type\" from this PDF document and reply with a JSON object:\n\n{text}"}, + ] + ) + + response = completion.choices[0].message["content"] + + print('---------------------------------') + print('API Response:') + print(response) + print('---------------------------------') + + try: + json_response = json.loads(response) + if ('company_name' in json_response and 'document_date' in json_response and 'document_type' in json_response): + company_name = json_response['company_name'] + document_date = json_response['document_date'] + document_type = json_response['document_type'] + + date = dateparser.parse(document_date, settings={ + 'DATE_ORDER': 'DMY' + }) + + if (is_valid_filename(company_name) and is_valid_filename(document_type) and date): + break + + except json.JSONDecodeError: + pass + + attempt += 1 + + if attempt == max_attempts: + return {"company_name": "Unknown", "document_date": "00000000", "document_type": "Unknown"} + + return json_response def parse_openai_response(response): company_name = response.get('company_name', 'Unknown') - date = dateparser.parse(response.get('document_date', '00000000'), settings={'DATE_ORDER': 'DMY'}) + date = dateparser.parse(response.get( + 'document_date', '00000000'), settings={'DATE_ORDER': 'DMY'}) document_type = response.get('document_type', 'Unknown') return company_name, date, document_type + def rename_invoice(pdf_path, company_name, date, document_type): - new_name = f'{date.strftime("%Y%m%d")} {company_name} {document_type}.pdf' + base_name = 
f'{date.strftime("%Y%m%d")} {company_name} {document_type}' + counter = 0 + new_name = base_name + '.pdf' new_path = os.path.join(os.path.dirname(pdf_path), new_name) - os.rename(pdf_path, new_path) - print(f'Invoice renamed to: {new_name}') + + if pdf_path == new_path: + print(f'File "{new_name}" is already correctly named.') + return + + while os.path.exists(new_path): + counter += 1 + new_name = f'{base_name} ({counter}).pdf' + new_path = os.path.join(os.path.dirname(pdf_path), new_name) + + try: + os.rename(pdf_path, new_path) + print(f'Invoice renamed to: {new_name}') + except Exception as e: + print(f'Error renaming {pdf_path}: {str(e)}') + + +def process_folder(folder_path): + for root, _, files in os.walk(folder_path): + for file in files: + if file.lower().endswith(".pdf"): + pdf_path = os.path.join(root, file) + text = pdf_to_text(pdf_path) + openai_response = get_openai_response(text) + company_name, date, document_type = parse_openai_response( + openai_response) + rename_invoice(pdf_path, company_name, date, document_type) + if __name__ == '__main__': if len(sys.argv) < 2: - print('Usage: python autorename_invoices.py ') + print('Usage: python autorename.py or ') sys.exit(1) - pdf_path = sys.argv[1] - text = pdf_to_text(pdf_path) - openai_response = get_openai_response(text) - company_name, date, document_type = parse_openai_response(openai_response) - rename_invoice(pdf_path, company_name, date, document_type) + input_path = sys.argv[1] + + if os.path.isfile(input_path) and input_path.lower().endswith('.pdf'): + text = pdf_to_text(input_path) + openai_response = get_openai_response(text) + company_name, date, document_type = parse_openai_response( + openai_response) + rename_invoice(input_path, company_name, date, document_type) + elif os.path.isdir(input_path): + process_folder(input_path) + else: + print('Invalid input. Please provide a path to a PDF file or a folder.') + sys.exit(1)