Skip to content

Commit

Permalink
Merge pull request #59 from PeARSearch/quick-pdf-indexing-fix
Browse files: browse the repository at this point in the history.
Fixed issue with removal of .txt files when pdftotext is not used.
  • Loading branch information
minimalparts authored Jul 12, 2024
2 parents e1070f6 + ed09438 commit 49ba590
Showing 1 changed file with 16 additions and 4 deletions.
20 changes: 16 additions & 4 deletions app/indexer/pdfparser.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@ def extract_txt(url, contributor):
except Exception:
print("ERROR extracting body text from pdf...")
remove(local_pdf_path)
remove(local_pdf_path.replace('.pdf','.txt'))
try:
remove(local_pdf_path.replace('.pdf','.txt'))
except:
pass
return title, body_str, language, snippet, cc, error

if title == "":
Expand All @@ -87,16 +90,25 @@ def extract_txt(url, contributor):
title = ""
error = "ERROR extract_html: Couldn't detect page language."
remove(local_pdf_path)
remove(local_pdf_path.replace('.pdf','.txt'))
try:
remove(local_pdf_path.replace('.pdf','.txt'))
except:
pass
return title, body_str, language, snippet, cc, error

if language not in installed_languages:
error = "ERROR extract_html: language is not supported."
title = ""
remove(local_pdf_path)
remove(local_pdf_path.replace('.pdf','.txt'))
try:
remove(local_pdf_path.replace('.pdf','.txt'))
except:
pass
return title, body_str, language, snippet, cc, error
snippet = ' '.join(body_str.split()[:snippet_length])
remove(local_pdf_path)
remove(local_pdf_path.replace('.pdf','.txt'))
try:
remove(local_pdf_path.replace('.pdf','.txt'))
except:
pass
return title, body_str, language, snippet, cc, error

0 comments on commit 49ba590

Please sign in to comment.