-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdetection.py
168 lines (130 loc) · 5.53 KB
/
detection.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
# Builtin
import os
import time
# Third party.
import pdf2image
import pytesseract
from PIL import Image
from pypdf import PdfReader, PdfWriter
# First party.
import config
import debug
from simplelog import log
from bank.info import BankNames as Banks # Too lazy to refactor?
def _background_level(image_object):
    """Return the red-channel value (0-255) of a pixel near the top-left corner.

    The sample is taken from a 1x1 crop converted to RGB, so it works for any
    image mode: single-band images ("L", "P", "1") make ``getpixel`` return a
    plain int rather than a tuple, and palette images return a palette index
    rather than a colour.
    """
    width, height = image_object.size
    # Clamp so images that are only one pixel wide or tall do not go out of range.
    x = min(1, width - 1)
    y = min(1, height - 1)
    swatch = image_object.crop((x, y, x + 1, y + 1)).convert("RGB")
    return swatch.getpixel((0, 0))[0]


def ocr_image(image_object):
    """Binarise an image and run Tesseract (Portuguese) on it.

    The binarisation threshold depends on the background: a pixel near the
    top-left corner is sampled, and ``config.THRESHOLD_LIGHT`` is used when its
    red channel is above ``config.THRESHOLD_LIMIT``, ``config.THRESHOLD_DARK``
    otherwise.

    Args:
        image_object: A PIL image of the receipt.

    Returns:
        The recognised text as a list of lines (split on ``"\\n"``; may
        contain empty strings).
    """
    # For images with white text on dark backgrounds, like N26 receipt.
    if _background_level(image_object) > config.THRESHOLD_LIMIT:
        threshold = config.THRESHOLD_LIGHT
    else:
        threshold = config.THRESHOLD_DARK

    # Grayscale, then push every pixel to pure black or pure white.
    boosted_image = image_object.convert(mode="L").point(
        lambda pixel_value: 255 if pixel_value > threshold else 0
    )

    if config.DEBUG_MODE:
        # Keep the binarised image around so the chosen threshold can be inspected.
        complete_path = os.path.join(config.dbg_path, f"{threshold}_{config.current_file}")
        boosted_image.save(f"{complete_path}.png")
    log(f"OCR threshold: {threshold}.")

    return pytesseract.image_to_string(boosted_image, lang="por").split("\n")
def brute_force_pwd(pdf_object, digits):
    """Try every zero-padded numeric password of the given length on a PDF.

    Does nothing (and returns False) when ``config.BRUTE_FORCE_PWD`` is falsy.

    Args:
        pdf_object: Object exposing ``decrypt(password)`` that returns 0 when
            the password is wrong (here: the ``PdfReader`` passed by the caller).
        digits: Number of digits in the candidate passwords.

    Returns:
        True as soon as a candidate decrypts the PDF, False otherwise.
    """
    if not config.BRUTE_FORCE_PWD:
        log("Brute forcing PDF password is disabled.")
        return False

    log("Brute forcing PDF password...")
    # 10 ** digits candidates, i.e. "00...0" through "99...9" inclusive.
    # (Stopping at int("9" * digits) exclusive would never try the all-nines password.)
    for pwd in range(10 ** digits):
        candidate = f"{pwd:0{digits}d}"
        if pdf_object.decrypt(candidate) != 0:
            log(f"PDF decrypted with password '{candidate}'.")
            return True

    log("Wrong password.")
    return False
def decrypt_pdf(pdf_object, cpf):
    """Unlock an encrypted PDF, escalating through several password sources.

    Order of attempts:
      1. ``cpf`` itself, then ``cpf`` with one more trailing character removed
         on each round, down to its first character.
      2. Brute force over 5-digit, then 6-digit numeric passwords.
      3. A password typed by the user at the prompt.

    Args:
        pdf_object: Object exposing ``decrypt(password)`` that returns 0 when
            the password is wrong.
        cpf: String used as the first password guess.

    Returns:
        True if any attempt decrypted the PDF, False otherwise.
    """
    # Longest prefix first: the full value, then one character shorter each time.
    for prefix_length in range(len(cpf), 0, -1):
        candidate = cpf[:prefix_length]
        log(f"Trying password: '{candidate}'")
        if pdf_object.decrypt(candidate) != 0:
            log("PDF decrypted.")
            return True

    # Every prefix failed (or cpf was empty): fall back to numeric brute force.
    brute_force_rounds = (
        (5, "Can't decrypt PDF. Brute forcing with 5 digits..."),
        (6, "Can't decrypt PDF. Brute forcing with 6 digits..."),
    )
    for width, announcement in brute_force_rounds:
        log(announcement)
        if brute_force_pwd(pdf_object, digits=width):
            return True

    # Last resort: let the user type the password.
    log("Can't decrypt PDF. 😧 Asking the user...")
    typed_password = input("Type PDF password: ").strip()
    if pdf_object.decrypt(typed_password) != 0:
        log("PDF decrypted.")
        return True
    log("Wrong password.")
    return False
class UnknownReceipt:
    """A receipt file (PDF or image) whose issuing bank is not yet known.

    Constructing an instance extracts the receipt text, records the file's
    modification date and guesses the bank from the text.

    Attributes set by ``__init__``:
        file_path: Path of the receipt file, as given.
        extracted_text: List of non-blank text lines, or None when an
            encrypted PDF could not be decrypted.
        timestamp: File modification date formatted as "YYYY MM DD".
        bank_guess: A ``Banks`` member (``Banks.Unknown`` when undetermined).
    """

    def __init__(self, file_path):
        self.file_path = file_path
        self.extracted_text = self.extract_text()
        self.timestamp = self.timestamp_of_file()
        self.bank_guess = self.guess_bank()

    def _is_pdf(self):
        """Return True when the file name ends in ".pdf", ignoring case."""
        return self.file_path.lower().endswith(".pdf")

    def extract_text(self, do_ocr=False):
        """Return the non-blank text lines of the receipt.

        PDFs are read through their text layer first; OCR on the first page is
        used when ``do_ocr`` is set, when the text layer has fewer than two
        lines, or when text extraction raises ``IndexError``. Any other file
        is treated as an image and always goes through OCR.

        An encrypted PDF is decrypted and then written back to ``file_path``,
        replacing the encrypted file on disk.

        Args:
            do_ocr: Force OCR for PDFs even if a text layer is present.

        Returns:
            A list of lines with blank lines removed, or None if an encrypted
            PDF could not be decrypted.
        """
        if self._is_pdf():
            log("File is a PDF.")
            reader = PdfReader(self.file_path)
            if reader.is_encrypted:
                log("File is encrypted.")
                if not decrypt_pdf(reader, config.CPF):
                    return None
                # Overwrite the file with a decrypted copy so later readers
                # (including pdf2image below) do not need the password.
                writer = PdfWriter()
                for page in reader.pages:
                    writer.add_page(page)
                writer.write(self.file_path)
                # Load decrypted file.
                reader = PdfReader(self.file_path)
            page = reader.pages[0]
            list_of_text = []
            try:  # We have to try here because some PDFs cause PdfReader to crash.
                log("Reading PDF text layer...")
                list_of_text = page.extract_text().split("\n")
                if len(list_of_text) < 2:
                    log("File has no text layer, running OCR...")
                    do_ocr = True
            except IndexError:
                log("PdfReader crashed, running OCR...")
                do_ocr = True
            if do_ocr:
                # Only the first page is used, so only rasterise that page.
                first_page = pdf2image.convert_from_path(
                    self.file_path, first_page=1, last_page=1
                )[0]
                list_of_text = ocr_image(first_page)
        else:
            log("File is an image, running OCR...")
            # Context manager releases the file handle once OCR is done.
            with Image.open(self.file_path) as image:
                list_of_text = ocr_image(image)
        clean_list_of_text = [text for text in list_of_text if text.strip()]
        debug.save_txt_to_disk(clean_list_of_text)
        return clean_list_of_text

    def guess_bank(self):
        """Detect the bank, retrying with forced OCR for PDFs.

        Returns:
            A ``Banks`` member; ``Banks.Unknown`` when nothing matched or when
            no text could be extracted at all.
        """
        if self.extracted_text is None:
            # Decryption failed: there is no text to inspect, and retrying
            # would only repeat the whole password search.
            return Banks.Unknown
        detected_bank = self.detect_bank()
        if detected_bank == Banks.Unknown and self._is_pdf():
            log("Can't identify bank from text layer, forcing OCR...")
            self.extracted_text = self.extract_text(True)
            detected_bank = self.detect_bank()
        return detected_bank

    def detect_bank(self):
        """Match marker substrings in the extracted text against known banks.

        The identification string is the first two lines lower-cased followed
        by the second-to-last line with its case preserved.

        Returns:
            The matching ``Banks`` member, or ``Banks.Unknown`` when there are
            fewer than two lines of text (or none) or no marker matches.
        """
        if self.extracted_text is None or len(self.extracted_text) < 2:
            return Banks.Unknown
        first_line = self.extracted_text[0].lower()
        second_line = self.extracted_text[1].lower()
        id_string = f"{first_line}{second_line}{self.extracted_text[-2]}"
        log(f"String used to identify bank: '{id_string}'.")
        # NOTE(review): the first two lines are lower-cased, so the markers
        # containing capitals ("NU", "Olá", "N26") can only match inside the
        # second-to-last line - confirm this is intended.
        if "NU" in id_string or "Olá" in id_string:
            return Banks.Nubank
        elif "mercado" in id_string:
            return Banks.MercadoPago
        elif "N26" in id_string:
            return Banks.N26
        elif "aunterpix" in id_string:
            return Banks.Inter
        elif "cobank" in id_string or "c6bank" in id_string:
            return Banks.C6
        elif "sucesso!valor" in id_string:
            return Banks.Claro
        elif "@genial" in id_string:
            return Banks.Genial
        else:
            return Banks.Unknown

    def timestamp_of_file(self):
        """Return the file's modification date in local time as "YYYY MM DD"."""
        file_modification_time = os.path.getmtime(self.file_path)  # In seconds since epoch.
        # Format the local-time struct directly rather than round-tripping
        # through a ctime() string that strptime() must parse back.
        return time.strftime("%Y %m %d", time.localtime(file_modification_time))