-
Notifications
You must be signed in to change notification settings - Fork 0
/
pdfText.py
50 lines (41 loc) · 1.38 KB
/
pdfText.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
import re
import fitz
from nltk.tokenize import sent_tokenize
from loggerLog import *
def readPdf(filename):
    """Extract the text of a PDF and return it as one cleaned-up string.

    Every page is read with ``page.get_text()``, its newlines are turned
    into spaces, and the pages are concatenated with a leading space in
    front of each one. The combined text is then passed through
    ``processPdfText``.

    Args:
        filename: Path of the PDF, handed straight to ``fitz.open``.

    Returns:
        The processed text. If ``processPdfText`` raises, the error is
        logged and the unprocessed concatenated text is returned instead.
    """
    with fitz.open(filename) as pdfDoc:
        pageChunks = [
            " " + page.get_text().replace('\n', ' ')
            for page in pdfDoc
        ]
    rawText = "".join(pageChunks)

    # Best effort: a failure while cleaning must not lose the extracted text.
    try:
        return processPdfText(rawText)
    except Exception as e:
        logger.error("Exception in function processPdfText in readPdf (pdfText.py): %s", e)
        return rawText
def processPdfText(text):
    """Clean raw PDF text and drop question/exclamation sentences.

    Steps, in order:
      1. Remove tokens of the form ``Page<digits>in<digits>``
         (presumably page footers -- confirm against the source PDFs).
      2. Remove space-delimited numbers followed by a period, such as
         the `` 1. `` of a numbered list.
      3. Replace the bullet glyphs ``•``, ``➢``, ``‣`` and ``›`` by spaces.
      4. Collapse runs of spaces into a single space.
      5. Split into sentences with ``sent_tokenize`` and keep only those
         that do not end in ``?`` or ``!``.

    Args:
        text: The raw text to clean.

    Returns:
        The kept sentences joined by single spaces.
    """
    # Raw strings: '\.' in a plain literal is an invalid escape sequence
    # and triggers a SyntaxWarning on recent Python versions.
    text = re.sub(r'Page[0-9]+in[0-9]+', ' ', text)
    text = re.sub(r' [0-9]+\. ', ' ', text)

    # One pass over the text instead of four chained .replace() calls.
    bulletsToSpace = str.maketrans(dict.fromkeys('•➢‣›', ' '))
    text = text.translate(bulletsToSpace)

    text = re.sub(r' +', ' ', text)

    # endswith() with a tuple is safe on an empty sentence, where
    # indexing [-1] would raise IndexError.
    keptSentences = [
        sentence
        for sentence in sent_tokenize(text)
        if not sentence.endswith(("?", "!"))
    ]
    return " ".join(keptSentences)
def readTxt(filename):
    """Return the full contents of ``<filename>.txt`` as a string.

    Args:
        filename: Path without the ``.txt`` extension; the extension is
            appended before the file is opened.

    Returns:
        The file contents, decoded as UTF-8.
    """
    txtPath = filename + ".txt"
    with open(txtPath, 'r', encoding='utf8') as handle:
        return handle.read()
def writeTxt(filename, mode, data):
    """Write ``data`` to ``<filename>.txt`` using the given open mode.

    Args:
        filename: Path without the ``.txt`` extension; the extension is
            appended before the file is opened.
        mode: Mode string passed to ``open`` (for example ``'w'`` or ``'a'``).
        data: Text to write; the file is encoded as UTF-8.
    """
    txtPath = filename + ".txt"
    with open(txtPath, mode, encoding='utf8') as handle:
        handle.write(data)
def writePdfToTxt(filename):
    """Extract the text of a PDF and save it next to it as a text file.

    The text comes from ``readPdf(filename)`` and is written with
    ``writeTxt`` in ``'w'`` mode. ``writeTxt`` appends ``.txt`` to the
    name it is given, so the output path is ``filename + ".txt"``.

    Args:
        filename: Path of the PDF to convert.
    """
    writeTxt(filename, 'w', readPdf(filename))