Merge pull request #59 from PeARSearch/quick-pdf-indexing-fix

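Fixed an issue where removing the intermediate .txt file failed when pdftotext was not used (and so no .txt file had been written); each removal of the .txt file is now wrapped in try/except so that a failed removal is ignored.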
PeARSearch · Jul 12, 2024 · 49ba590
2 parents e1070f6 + ed09438
commit 49ba590
Showing 1 changed file with 16 additions and 4 deletions.
diff --git a/app/indexer/pdfparser.py b/app/indexer/pdfparser.py
@@ -75,7 +75,10 @@ def extract_txt(url, contributor):
     except Exception:
         print("ERROR extracting body text from pdf...")
         remove(local_pdf_path)
-        remove(local_pdf_path.replace('.pdf','.txt'))
+        try:
+            remove(local_pdf_path.replace('.pdf','.txt'))
+        except:
+            pass
         return title, body_str, language, snippet, cc, error
 
     if title == "":
@@ -87,16 +90,25 @@ def extract_txt(url, contributor):
         title = ""
         error = "ERROR extract_html: Couldn't detect page language."
         remove(local_pdf_path)
-        remove(local_pdf_path.replace('.pdf','.txt'))
+        try:
+            remove(local_pdf_path.replace('.pdf','.txt'))
+        except:
+            pass
         return title, body_str, language, snippet, cc, error
 
     if language not in installed_languages:
         error = "ERROR extract_html: language is not supported."
         title = ""
         remove(local_pdf_path)
-        remove(local_pdf_path.replace('.pdf','.txt'))
+        try:
+            remove(local_pdf_path.replace('.pdf','.txt'))
+        except:
+            pass
         return title, body_str, language, snippet, cc, error
     snippet = ' '.join(body_str.split()[:snippet_length])
     remove(local_pdf_path)
-    remove(local_pdf_path.replace('.pdf','.txt'))
+    try:
+        remove(local_pdf_path.replace('.pdf','.txt'))
+    except:
+        pass
     return title, body_str, language, snippet, cc, error