initial rough implementation of LLM integration
osma committed Jul 31, 2024
1 parent 0b47462 commit 3617d99
Showing 4 changed files with 174 additions and 4 deletions.
1 change: 1 addition & 0 deletions metadata_extract/candidate.py
@@ -15,6 +15,7 @@ class Origin(Enum):
COPYRIGHT = 5
RAPPORT_PREFIX = 6
LANGUAGE_MODEL = 7
LLM = 8


class OriginType(TypedDict):
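For context, the new Origin.LLM value tags candidates produced by the LLM extractor below, just like the existing origins. A minimal sketch of its use (the title string is invented):

    from metadata_extract.candidate import Candidate, Origin

    # Tag an extracted metadata value as coming from the LLM extractor
    candidate = Candidate("Annual Report 2023", Origin.LLM)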
110 changes: 110 additions & 0 deletions metadata_extract/llm_extractor.py
@@ -0,0 +1,110 @@
"""The LLM extractor module extracts metadata using an external LLM API service."""

import json
import os
import requests
from .candidate import AuthorType, Candidate, Origin
from .metadata import Metadata
from .meteor_document import MeteorDocument


class LLMExtractor:
"""A LLMExtractor object loads a MeteorDocument and fills a Metadata object
by performing a call to an external LLM API service."""

    MODEL_NAME = "xxx"  # the model name doesn't matter when talking to a llama.cpp server
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \
"cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."
MAX_TOKENS = 1024
TEMPERATURE = 0.0
TIMEOUT = 30

def __init__(self, doc: MeteorDocument):
self._doc = doc
self.metadata = Metadata()

@classmethod
def is_available(cls) -> bool:
return cls._api_url() is not None

def extract_metadata(self) -> None:
doc_json = self._doc.extract_text_as_json()
response = self._llm_request(doc_json)
self._parse_response_to_doc(response)

def _llm_request(self, doc_json: str) -> str:
message = f"{self.INSTRUCTION}\n\n{doc_json}"

headers = {
"Content-Type": "application/json",
}

data = {
"model": self.MODEL_NAME,
"messages": [
{"role": "system", "content": self.SYSTEM_PROMPT},
{"role": "user", "content": message},
],
"temperature": self.TEMPERATURE,
"max_tokens": self.MAX_TOKENS
}

api_response = requests.post(str(self._api_url()),
headers=headers,
json=data,
timeout=self.TIMEOUT)

api_response.raise_for_status()
return str(api_response.json()['choices'][0]['message']['content'])

def _parse_response_to_doc(self, response: str) -> None:
metadata = json.loads(response)

# language
if 'language' in metadata:
self.metadata.add_candidate('language', Candidate(metadata['language'], Origin.LLM))

# title
if 'title' in metadata:
self.metadata.add_candidate('title', Candidate(metadata['title'], Origin.LLM))

# creator
if 'creator' in metadata:
for creator in metadata['creator']:
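                # assumes the LLM returns creators as "Lastname, Firstname" strings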
lastname, firstname = creator.split(', ', maxsplit=1)
author_dict: AuthorType = {"firstname": firstname, "lastname": lastname}
self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM))

# year
if 'year' in metadata:
self.metadata.add_candidate('year', Candidate(metadata['year'], Origin.LLM))

# publisher
if 'publisher' in metadata:
for publisher in metadata['publisher']:
# FIXME should we look up publisher in registry like Finder does?
self.metadata.add_candidate('publisher', Candidate(publisher, Origin.LLM))

# doi - not supported by Meteor

# e-isbn
if 'e-isbn' in metadata:
            # This is pretty poor: we just pass the found e-ISBNs (almost never more than one)
            # to Meteor directly and let it pick one essentially at random
for e_isbn in metadata['e-isbn']:
self.metadata.add_candidate('ISBN', Candidate(e_isbn, Origin.LLM))

# p-isbn - Meteor isn't interested in printed ISBNs

# e-issn
if 'e-issn' in metadata:
self.metadata.add_candidate('ISSN', Candidate(metadata['e-issn'], Origin.LLM))

        # p-issn - Meteor isn't interested in printed ISSNs

# type_coar - not supported by Meteor

@classmethod
def _api_url(cls) -> str | None:
return os.environ.get('LLM_API_URL')
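The parser above expects the model's reply to be a JSON object along these lines (an illustrative sketch with invented values, using the field names that _parse_response_to_doc consumes):

    # Illustrative model reply for _parse_response_to_doc; all values are invented.
    example_response = {
        "language": "fi",
        "title": "Example Report Title",
        "creator": ["Meikäläinen, Matti", "Virtanen, Maija"],
        "year": "2023",
        "publisher": ["Example Publisher"],
        "e-isbn": ["978-952-00-0000-0"],
        "e-issn": "1234-5678"
    }

Note that creator, publisher and e-isbn are parsed as lists, while e-issn is read as a single value.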
13 changes: 9 additions & 4 deletions metadata_extract/meteor.py
@@ -9,6 +9,7 @@
from .meteor_document import MeteorDocument
from .metadata import Results
from .finder import Finder
from .llm_extractor import LLMExtractor


class Meteor:
@@ -43,7 +44,11 @@ def set_language_detection_method(self, detect_language: Callable[[str], str]) -

def run(self, file_path: str) -> Results:
with MeteorDocument(file_path) as doc:
finder = Finder(doc, self.registry, self.detect_language)
finder.extract_metadata()
finder.metadata.choose_best()
return finder.metadata.results
            extractor: LLMExtractor | Finder
if LLMExtractor.is_available():
extractor = LLMExtractor(doc)
else:
extractor = Finder(doc, self.registry, self.detect_language)
extractor.extract_metadata()
extractor.metadata.choose_best()
return extractor.metadata.results
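To exercise the new code path end to end, LLM_API_URL must point at an OpenAI-compatible chat completions endpoint, for example a local llama.cpp server. A minimal sketch, with placeholder URL and file name:

    import os

    from metadata_extract.llm_extractor import LLMExtractor
    from metadata_extract.meteor_document import MeteorDocument

    # Placeholder URL; llama.cpp's server exposes an OpenAI-compatible
    # chat completions endpoint at /v1/chat/completions.
    os.environ['LLM_API_URL'] = 'http://localhost:8080/v1/chat/completions'

    with MeteorDocument('document.pdf') as doc:  # placeholder file name
        if LLMExtractor.is_available():
            extractor = LLMExtractor(doc)
            extractor.extract_metadata()
            extractor.metadata.choose_best()
            print(extractor.metadata.results)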
54 changes: 54 additions & 0 deletions metadata_extract/meteor_document.py
@@ -4,10 +4,13 @@
"""


import json
from pathlib import Path
import re
from types import TracebackType
from typing import Optional, Self, Type
import fitz
import regex
from .page import Page
from .alto_utils import AltoFile

@@ -19,6 +22,13 @@ class MeteorDocument:
content. MeteorDocuments are context managers, so they can be used in `with` statements.
"""

# text extraction settings for LLM
PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1] # pages to analyze: first 8 pages + last 2 pages
THRESHOLD = 100 # paragraphs shorter than this will always be kept
LONG_PARA_PAGES = [0, 1] # on first two pages, some long paragraphs are accepted
LONG_PARA_MAX = 2 # how many long paragraphs to keep on the first two pages
PDF_METADATA_SKIP = {'format', 'creator', 'producer'} # PDF metadata fields not to include

def __init__(self, file_path: str,
start: int = 5,
end: int = 5):
@@ -92,3 +102,47 @@ def get_page_object(self, page_number: int) -> Page:
raise ValueError('No PDF file to load page from')
self.page_objects[page_number] = Page(pdf_page=self.pdfdoc.load_page(page_number - 1))
return self.page_objects[page_number]

def extract_text_as_json(self) -> str:
"""Extract text and metadata as a JSON string suitable for a LLM"""

if not self.pdfdoc:
raise ValueError('No PDF document set')

pdfinfo = {}
pages = []

for key in self.pdfdoc.metadata.keys():
if key not in self.PDF_METADATA_SKIP and self.pdfdoc.metadata.get(key):
pdfinfo[key] = self.pdfdoc.metadata.get(key)

        for page in self.PAGES:
            if page >= 0 and page >= len(self.pdfdoc) - 2:
                # non-negative index already covered by the negative indexes -2 and -1
                continue

texts = []
text = self.pdfdoc[page].get_text(sort=True)
# Use regular expression to split text into paragraphs
# Delimiter: newline(s) followed by an upper case character
paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE)
long_paragraph_count = 0

for paragraph in paragraphs:
paragraph = " ".join(paragraph.strip().split())

if '.....' in paragraph or '. . . . .' in paragraph:
# looks like a ToC entry, skip it
continue
if len(paragraph) < self.THRESHOLD: # short paragraph, keep it
texts.append(paragraph)
elif page in self.LONG_PARA_PAGES and long_paragraph_count < self.LONG_PARA_MAX:
# allow some long paragraphs on the first two pages
long_paragraph_count += 1
texts.append(paragraph)
else: # must be a long paragraph, skip it
pass
text = '\n'.join(texts)
if text:
pages.append({"page": self.pdfdoc[page].number, "text": text})

return json.dumps({"pdfinfo": pdfinfo, "pages": pages})
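For reference, the returned JSON string pairs the PDF metadata with the filtered page texts. An invented example of the payload the LLM receives (pretty-printed here):

    {
        "pdfinfo": {"title": "Example Report", "author": "Example Author"},
        "pages": [
            {"page": 0, "text": "Example Report\nA short opening paragraph."},
            {"page": 1, "text": "Another short paragraph kept by the filter."}
        ]
    }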
