-
Notifications
You must be signed in to change notification settings - Fork 0
/
conversions.py
220 lines (172 loc) · 9.17 KB
/
conversions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
import os
import zipfile
import pdfkit
from bs4 import BeautifulSoup
def update_progress(progress_var, value, max_value):
"""
Updates the progress bar variable based on the current progress.
Parameters:
- progress_var (tkinter.Variable): Tkinter variable linked to the progress bar.
- value (int): Current progress value.
- max_value (int): Total number representing the maximum progress value.
"""
progress_percentage = int((value / max_value) * 100)
progress_var.set(progress_percentage)
class ZipExtractor:
def __init__(self, input_directory, output_directory):
self.input_directory = input_directory
self.output_directory = output_directory
def extract_zipFiles(self):
"""
Extracts ONLY .html files from .zip files from given input_directory and saves them to the output+directory.
Removes successfully processed .zip files from the input_directory.
Raises:
- FileNotFoundError: If the specified .zip file is not found in the input_directory.
- zipfile.BadZipFile: If the .zip file is not a valid zip file.
- PermissionError: If the function lacks the necessary permissions to extract or delete a file.
- OSError: If an unexpected error occurs during the extraction process.
"""
files = os.listdir(self.input_directory)
total_files = len(files)
successfully_processed_zips = []
for index, zip_file in enumerate(files):
try:
with zipfile.ZipFile(os.path.join(self.input_directory, zip_file), 'r') as zip_ref:
file_list = zip_ref.namelist()
html_files = [file for file in file_list if file.endswith('.html')]
for html_file in html_files:
zip_ref.extract(html_file, self.output_directory)
successfully_processed_zips.append(zip_file)
except Exception as e:
print(f"error processing {zip_file} : {e}")
ZipExtractor.remove_zip_files(successfully_processed_zips, self.input_directory)
print("Extraction Complete!")
@staticmethod
def remove_zip_files(processed_zips, input_directory):
"""
Removes the successfully processed .zip files from the specified input_directory.
Parameters:
- processed_zips (list): List of .zip files that have been successfully processed
- input_directory (str): The directory containing the processed .zip files.
Raises:
- FileNotFoundError: If the specified .zip file is not found in the input_directory.
- PermissionError: If the function lacks the necessary permissions to delete a file.
- OSError: If an unexpected error occurs during the deletion process.
"""
for zip_file in processed_zips:
try:
os.remove(os.path.join(input_directory, zip_file))
except Exception as e:
print(f'error deleting {zip_file} : {e}')
class HtmlToPdfConverter:
def __init__(self, input_directory, output_directory):
self.input_directory = input_directory
self.output_directory = output_directory
def convert_to_pdf(self, progress_var):
"""
Converts .html files to .pdf files from input_directory and saves them into output_directory.
Parameters:
- progress_var (tkinter.Variable): Tkinter variable linked to the progress bar.
Raises:
- FileNotFoundError: If the specified .html file is not found in the input_directory.
- pdfkit.ConfigurationError: If there is an issue with the configuration of pdfkit.
- pdfkit.BlockingIOError: If there is an issue with file I/O operations during conversion.
- OSError: If an unexpected error occurs during the conversion process.
Note:
- The pdfkit module must be installed and properly configured.
- This function uses the wkhtmltopdf.exe for conversion. Make sure it is installed and
its location properly specified in config.
"""
config = pdfkit.configuration(wkhtmltopdf=r'path/to/wkhtmltopdf')
files = os.listdir(self.input_directory)
total_files = len(files)
for index, html_file in enumerate(files):
input_path = os.path.join(self.input_directory, html_file)
output_path = os.path.join(self.output_directory, os.path.splitext(html_file)[0] + '.pdf')
try:
pdfkit.from_file(input_path, output_path, configuration=config)
print(f'converted {html_file} to PDF')
update_progress(progress_var, index + 1, total_files)
except Exception as e:
print(f'error converting {html_file} to PDF: {e}')
update_progress(progress_var, index + 1, total_files)
class PdfRenamer:
def __init__(self, pdf_directory, output_directory):
self.pdf_directory = pdf_directory
self.output_directory = output_directory
def rename_PDF(self):
"""
Renames .pdf files with using real customer names extracted from the corresponding .html files.
Raises:
- FileNotFoundError: If the specified .html file is not found in the output_dir.
- PermissionError: If the function lacks the necessary permissions to rename or delete a file.
- OSError: If an unexpected error occurs during the renaming process.
"""
pdf_files = [file for file in os.listdir(self.pdf_directory) if file.endswith('.pdf')]
for pdf_file in pdf_files:
html_path = os.path.join(self.output_directory, os.path.splitext(pdf_file)[0] + '.html')
try:
name_surname = self.get_name_surname_withID(html_path)
new_file_name = self.get_name_surname(name_surname) + '.pdf'
new_PDF_path = os.path.join(self.pdf_directory, new_file_name)
if os.path.exists(new_PDF_path):
os.remove(new_PDF_path)
os.rename(os.path.join(self.pdf_directory, pdf_file), new_PDF_path)
print(f'Renamed {pdf_file} to {new_file_name}')
except Exception as e:
print(f'error renaming {pdf_file}: {e}')
def get_name_surname_withID(self, html_file_path):
"""
Extracts information (name, surname, address etc.) from the specified table of html files.
Parameters:
- html_file_path (str): The path to html files.
Returns:
str: The extracted information from customerPartyTable.
Note:
- The HTML file should contain a table with the id 'customerPartyTable'.
- The function assumes that the name is located in the first column of the table.
"""
with open(html_file_path, 'r', encoding='utf-8') as file:
html_content = file.read()
soup = BeautifulSoup(html_content, 'html.parser')
target_table = soup.find('table', {'id': 'customerPartyTable'})
if target_table:
rows = target_table.find_all('tr')
for row in rows:
cells = row.find_all('td')
if len(cells) >= 2:
name = cells[0].text.strip()
return name
def get_name_surname(self, data):
"""
Extracts name and surname information from the given data set.
Parameters:
- data (str): A data set which includes name and surname information.
Returns:
str: Concatenation of the first and second elements of the dataset.
Note:
- The function assumes that the name and surname are the first two elements in the dataset.
- It is designed to work correctly if the customer has one name and one surname.
For example, for a customer named Jack Sparrow, the return will be "Jack Sparrow".
However, for a customer named Captain Jack Sparrow, the return will be "Captain Jack".
- The function assumes the length of a customer name never exceeds 60 characters.
"""
lst = data[5:60].split()
return lst[0] + " " + lst[1]
class PDFConversionTool:
def __init__(self, input_directory, output_directory, pdf_directory):
self.input_directory = input_directory
self.output_directory = output_directory
self.pdf_directory = pdf_directory
def run_conversion(self, progress_var):
"""
Starts the conversion operations.
Parameters:
- progress_var (tkinter.Variable): Tkinter variable linked to the progress bar.
"""
extractor = ZipExtractor(self.input_directory, self.output_directory)
extractor.extract_zipFiles()
converter = HtmlToPdfConverter(self.output_directory, self.pdf_directory)
converter.convert_to_pdf(progress_var)
renamer = PdfRenamer(self.pdf_directory, self.output_directory)
renamer.rename_PDF()