From ed094388de44c2b569a221a25571f933117a7463 Mon Sep 17 00:00:00 2001 From: minimalparts Date: Fri, 12 Jul 2024 15:43:02 +0200 Subject: [PATCH] Fixed issue with removal of .txt files when pdftotext is not used. --- app/indexer/pdfparser.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/app/indexer/pdfparser.py b/app/indexer/pdfparser.py index 260d89d..8e989e1 100644 --- a/app/indexer/pdfparser.py +++ b/app/indexer/pdfparser.py @@ -75,7 +75,10 @@ def extract_txt(url, contributor): except Exception: print("ERROR extracting body text from pdf...") remove(local_pdf_path) - remove(local_pdf_path.replace('.pdf','.txt')) + try: + remove(local_pdf_path.replace('.pdf','.txt')) + except: + pass return title, body_str, language, snippet, cc, error if title == "": @@ -87,16 +90,25 @@ def extract_txt(url, contributor): title = "" error = "ERROR extract_html: Couldn't detect page language." remove(local_pdf_path) - remove(local_pdf_path.replace('.pdf','.txt')) + try: + remove(local_pdf_path.replace('.pdf','.txt')) + except: + pass return title, body_str, language, snippet, cc, error if language not in installed_languages: error = "ERROR extract_html: language is not supported." title = "" remove(local_pdf_path) - remove(local_pdf_path.replace('.pdf','.txt')) + try: + remove(local_pdf_path.replace('.pdf','.txt')) + except: + pass return title, body_str, language, snippet, cc, error snippet = ' '.join(body_str.split()[:snippet_length]) remove(local_pdf_path) - remove(local_pdf_path.replace('.pdf','.txt')) + try: + remove(local_pdf_path.replace('.pdf','.txt')) + except: + pass return title, body_str, language, snippet, cc, error