Skip to content

Commit

Permalink
V1
Browse files Browse the repository at this point in the history
optimized API Call by using system message,
renaming of folders
  • Loading branch information
ptmrio committed Apr 17, 2023
1 parent be61d62 commit 8173f10
Show file tree
Hide file tree
Showing 4 changed files with 182 additions and 25 deletions.
2 changes: 2 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
OPENAI_API_KEY=your_api_key_here
MY_COMPANY_NAME=your_company_name_here
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
.env
52 changes: 52 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
AIAutoRename
==========

AIAutoRename is a Python script that automatically renames PDF files based on their content. It uses the [OpenAI GPT API](https://platform.openai.com/account/api-keys) to extract relevant information such as the document date, company name, and document type from the PDF's text.

Installation
------------

To use AIAutoRename, you'll need Python 3.6 or later. You can download it from the [official Python website](https://www.python.org/downloads/) or the Microsoft Store.

After installing Python, you can install the required packages by running the following command in your terminal:

`pip install python-dotenv pdf2image pytesseract openai dateparser`

Now clone or download this repository and navigate to the root directory of the project in your terminal.

Configuration
-------------

AIAutoRename uses environment variables to configure the OpenAI API key and the name of your company. Before running the script, you'll need to create a file named `.env` in the root directory of the project and add the following lines:


```
OPENAI_API_KEY=<your-api-key>
MY_COMPANY_NAME=<your-company-name>
```

You can obtain an API key from the [OpenAI website](https://beta.openai.com/docs/developer-quickstart/your-api-keys). The `MY_COMPANY_NAME` variable should be set to your company's name. This lets the OpenAI API know who you are, so it can decide whether to use the sender or the recipient of the PDF document as the company name.

Usage
-----

### Renaming a single PDF file

To rename a single PDF file, run the following command in your terminal (cmd on Windows, terminal on Mac):

`python autorename.py path/to/invoice.pdf`

Replace `path/to/invoice.pdf` with the path to your PDF file.

### Renaming all PDF files in a folder

To rename all PDF files in a folder and its subfolders, run the following command in your terminal:

`python autorename.py path/to/folder`

Replace `path/to/folder` with the path to your folder (no trailing slash).

Contributing
------------

We welcome contributions from anyone! If you find a bug or have a feature request, please open an issue on our [GitHub repository](https://github.com/example/AIAutoRename). If you'd like to contribute code, please open a pull request with your changes.
152 changes: 127 additions & 25 deletions autorename.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import os
from dotenv import load_dotenv
import sys
from pdf2image import convert_from_path
import pytesseract
Expand All @@ -7,51 +8,152 @@
import dateparser
import re

# Read credentials and settings from a local `.env` file (see `.env.example`)
# so the API key is never hard-coded in the source.
load_dotenv()
openai.api_key = os.getenv("OPENAI_API_KEY")
my_company_name = os.getenv("MY_COMPANY_NAME")


def pdf_to_text(pdf_path):
    """Return the OCR text of the first page of the PDF at *pdf_path*.

    Only page 1 is rendered to an image and passed to Tesseract; the
    remaining pages are never converted.

    Args:
        pdf_path: Path to the PDF file to read.

    Returns:
        The recognized text as a single string (empty if no page image
        was produced).
    """
    # Restricting the conversion to page 1 avoids rasterizing the whole
    # document when only the first page is used.
    images = convert_from_path(pdf_path, first_page=1, last_page=1)
    return ''.join(pytesseract.image_to_string(image) for image in images)


def truncate_text(text, max_tokens=2048):
    """Return *text* limited to its first *max_tokens* whitespace-separated words.

    Words are joined back with single spaces, so runs of whitespace
    (including newlines) are collapsed.

    Args:
        text: The text to shorten.
        max_tokens: Maximum number of words to keep. Values of zero or
            less yield an empty string.

    Returns:
        The truncated text.
    """
    # A negative limit would otherwise slice from the end of the word list
    # (dropping trailing words) instead of keeping nothing.
    if max_tokens <= 0:
        return ''
    return ' '.join(text.split()[:max_tokens])


def is_valid_filename(filename: str) -> bool:
    """Return True if *filename* is usable as part of a file name.

    A value is rejected when it is not a string (e.g. a JSON ``null``),
    is empty or whitespace-only, or contains one of the characters
    ``< > : " / \\ | ? *`` or an ASCII control character.

    Args:
        filename: The candidate name fragment.

    Returns:
        True when the fragment is acceptable, False otherwise.
    """
    # Values decoded from JSON may be None or numbers; treat them as invalid
    # instead of letting re.search raise TypeError.
    if not isinstance(filename, str) or not filename.strip():
        return False
    forbidden_characters = r'[<>:"/\\|?*\x00-\x1f]'
    return not re.search(forbidden_characters, filename)


def get_openai_response(text):
    """Ask the chat model to extract company name, date and document type.

    The request is sent up to three times. An answer is accepted only when
    it is a JSON object whose ``company_name``, ``document_date`` and
    ``document_type`` are strings, the date can be parsed (day-first), and
    both names pass ``is_valid_filename``.

    Args:
        text: OCR text of the document.

    Returns:
        The decoded JSON object of the first acceptable answer, or a
        fallback dict with ``"Unknown"`` / ``"00000000"`` values when no
        attempt produced one.
    """
    max_attempts = 3
    # NOTE(review): the sentences below are concatenated without separating
    # spaces; the text is kept unchanged here to avoid altering model behavior.
    system_prompt = (
        "You will be asked to extract the company name, document date, and document type from a PDF document."
        "Due to the nature of OCR, the text will be very noisy and might contain spelling errors, handle those as good as possible."
        "You will only return a JSON object with these properties only \"company_name\", \"document_date\", \"document_type\"."
        "No additional text and no formatting. Only the JSON object."
        "If the text language is German, assume a European date format (dd.mm.YYYY or dd/mm/YYYY or reverse) in the text. Return format: dd.mm.YYYY"
        "Valid document types are: For Invoices use the term 'ER' only, nothing more. For all other documents, find a short descriptive summary/subject in german language."
        "Here are three example responses for training purpose only:"
        "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"ER\"} "
        "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"Einzahlungsbestätigung\"} "
        "{\"company_name\": \"ACME\", \"document_date\": \"01.01.2021\", \"document_type\": \"Angebot\"}"
    )

    for attempt in range(1, max_attempts + 1):
        print(f'Attempt {attempt}/{max_attempts}')
        print('---------------------------------')
        print('PDF text (preview):')
        print(text[:1000])
        print('---------------------------------')

        completion = openai.ChatCompletion.create(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Extract the \"company_name\", \"document_date\", \"document_type\" from this PDF document and reply with a JSON object:\n\n{text}"},
            ]
        )

        response = completion.choices[0].message["content"]

        print('---------------------------------')
        print('API Response:')
        print(response)
        print('---------------------------------')

        try:
            json_response = json.loads(response)
        except (json.JSONDecodeError, TypeError):
            # Not JSON (or no content at all): ask again.
            continue

        if not isinstance(json_response, dict):
            continue

        company_name = json_response.get('company_name')
        document_date = json_response.get('document_date')
        document_type = json_response.get('document_type')

        # Missing keys and non-string values (null, numbers) count as a
        # failed attempt rather than crashing the validators below.
        fields = (company_name, document_date, document_type)
        if not all(isinstance(value, str) for value in fields):
            continue

        date = dateparser.parse(document_date, settings={
            'DATE_ORDER': 'DMY'
        })

        if (is_valid_filename(company_name) and is_valid_filename(document_type) and date):
            return json_response

    return {"company_name": "Unknown", "document_date": "00000000", "document_type": "Unknown"}


def parse_openai_response(response):
    """Split the extraction result into its three components.

    Args:
        response: Mapping with optional ``company_name``,
            ``document_date`` and ``document_type`` entries.

    Returns:
        A ``(company_name, date, document_type)`` tuple. ``date`` is the
        value returned by ``dateparser.parse`` using day-first ordering,
        or ``None`` when the stored date is not a string; callers must be
        prepared for a missing date.
    """
    company_name = response.get('company_name', 'Unknown')
    document_type = response.get('document_type', 'Unknown')

    raw_date = response.get('document_date', '00000000')
    # Only strings are handed to the parser; anything else is treated as
    # "no date" instead of raising.
    if isinstance(raw_date, str):
        date = dateparser.parse(raw_date, settings={'DATE_ORDER': 'DMY'})
    else:
        date = None

    return company_name, date, document_type


def rename_invoice(pdf_path, company_name, date, document_type):
    """Rename a PDF to ``"<YYYYMMDD> <company_name> <document_type>.pdf"``.

    The file stays in its directory. If the target name is taken by another
    file, `` (1)``, `` (2)``, ... is appended until a free name is found.
    Nothing is renamed when the file already carries the target name or one
    of its numbered variants. Rename failures are reported on stdout, not
    raised.

    Args:
        pdf_path: Path of the file to rename.
        company_name: Company part of the new name.
        date: Object providing ``strftime`` (e.g. ``datetime``), or ``None``
            when no date could be determined; ``00000000`` is used then.
        document_type: Document-type part of the new name.
    """
    date_prefix = date.strftime("%Y%m%d") if date is not None else "00000000"
    base_name = f'{date_prefix} {company_name} {document_type}'
    directory = os.path.dirname(pdf_path)
    # Compare absolute paths so "file.pdf" and "./file.pdf" count as the same file.
    source = os.path.abspath(pdf_path)

    counter = 0
    new_name = base_name + '.pdf'
    while True:
        new_path = os.path.join(directory, new_name)
        if os.path.abspath(new_path) == source:
            print(f'File "{new_name}" is already correctly named.')
            return
        if not os.path.exists(new_path):
            break
        counter += 1
        new_name = f'{base_name} ({counter}).pdf'

    try:
        os.rename(pdf_path, new_path)
    except OSError as e:
        print(f'Error renaming {pdf_path}: {str(e)}')
    else:
        print(f'Invoice renamed to: {new_name}')


def process_folder(folder_path):
    """Rename every PDF found under *folder_path*, including subfolders.

    Each file whose name ends in ``.pdf`` (case-insensitive) is OCR'd,
    sent through the extraction step and renamed in place.

    Args:
        folder_path: Root directory to walk.
    """
    for directory, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            # Skip anything that is not a PDF.
            if not filename.lower().endswith(".pdf"):
                continue

            pdf_path = os.path.join(directory, filename)
            extracted = get_openai_response(pdf_to_text(pdf_path))
            company_name, date, document_type = parse_openai_response(extracted)
            rename_invoice(pdf_path, company_name, date, document_type)


if __name__ == '__main__':
    # Command-line entry point: the single argument is either one PDF file
    # or a folder that is processed recursively.
    if len(sys.argv) < 2:
        print('Usage: python autorename.py <path_to_invoice.pdf> or <path_to_folder>')
        sys.exit(1)

    input_path = sys.argv[1]

    if os.path.isfile(input_path) and input_path.lower().endswith('.pdf'):
        text = pdf_to_text(input_path)
        openai_response = get_openai_response(text)
        company_name, date, document_type = parse_openai_response(
            openai_response)
        rename_invoice(input_path, company_name, date, document_type)
    elif os.path.isdir(input_path):
        process_folder(input_path)
    else:
        # Neither an existing PDF file nor a directory.
        print('Invalid input. Please provide a path to a PDF file or a folder.')
        sys.exit(1)

0 comments on commit 8173f10

Please sign in to comment.