initial rough implementation of LLM integration
osma committed Jul 31, 2024
1 parent 0b47462 commit 3617d99
Showing 4 changed files with 174 additions and 4 deletions.
1 change: 1 addition & 0 deletions metadata_extract/candidate.py
@@ -15,6 +15,7 @@ class Origin(Enum):
COPYRIGHT = 5
RAPPORT_PREFIX = 6
LANGUAGE_MODEL = 7
LLM = 8


class OriginType(TypedDict):
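For context, the new Origin.LLM value tags candidates produced by the LLM extractor below, just like the existing origins. A minimal sketch of its use (the title string is invented):

    from metadata_extract.candidate import Candidate, Origin

    # Tag an extracted metadata value as coming from the LLM extractor
    candidate = Candidate("Annual Report 2023", Origin.LLM)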
110 changes: 110 additions & 0 deletions metadata_extract/llm_extractor.py
@@ -0,0 +1,110 @@
"""The LLM extractor module extracts metadata using an external LLM API service."""

import json
import os
import requests
from .candidate import AuthorType, Candidate, Origin
from .metadata import Metadata
from .meteor_document import MeteorDocument


class LLMExtractor:
"""A LLMExtractor object loads a MeteorDocument and fills a Metadata object
by performing a call to an external LLM API service."""

    MODEL_NAME = "xxx"  # the model name doesn't matter when talking to a llama.cpp server
SYSTEM_PROMPT = "You are a skilled librarian specialized in meticulous " + \
"cataloguing of digital documents."
INSTRUCTION = "Extract metadata from this document. Return as JSON."
MAX_TOKENS = 1024
TEMPERATURE = 0.0
TIMEOUT = 30

def __init__(self, doc: MeteorDocument):
self._doc = doc
self.metadata = Metadata()

@classmethod
def is_available(cls) -> bool:
return cls._api_url() is not None

def extract_metadata(self) -> None:
doc_json = self._doc.extract_text_as_json()
response = self._llm_request(doc_json)
self._parse_response_to_doc(response)

def _llm_request(self, doc_json: str) -> str:
message = f"{self.INSTRUCTION}\n\n{doc_json}"

headers = {
"Content-Type": "application/json",
}

data = {
"model": self.MODEL_NAME,
"messages": [
{"role": "system", "content": self.SYSTEM_PROMPT},
{"role": "user", "content": message},
],
"temperature": self.TEMPERATURE,
"max_tokens": self.MAX_TOKENS
}

api_response = requests.post(str(self._api_url()),
headers=headers,
json=data,
timeout=self.TIMEOUT)

api_response.raise_for_status()
return str(api_response.json()['choices'][0]['message']['content'])

def _parse_response_to_doc(self, response: str) -> None:
metadata = json.loads(response)

# language
if 'language' in metadata:
self.metadata.add_candidate('language', Candidate(metadata['language'], Origin.LLM))

# title
if 'title' in metadata:
self.metadata.add_candidate('title', Candidate(metadata['title'], Origin.LLM))

# creator
if 'creator' in metadata:
for creator in metadata['creator']:
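                # assumes the LLM returns creators as "Lastname, Firstname" strings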
lastname, firstname = creator.split(', ', maxsplit=1)
author_dict: AuthorType = {"firstname": firstname, "lastname": lastname}
self.metadata.add_candidate('author', Candidate(author_dict, Origin.LLM))

# year
if 'year' in metadata:
self.metadata.add_candidate('year', Candidate(metadata['year'], Origin.LLM))

# publisher
if 'publisher' in metadata:
for publisher in metadata['publisher']:
# FIXME should we look up publisher in registry like Finder does?
self.metadata.add_candidate('publisher', Candidate(publisher, Origin.LLM))

# doi - not supported by Meteor

# e-isbn
if 'e-isbn' in metadata:
            # This is pretty poor: we just pass the found e-ISBNs (almost never more than one)
            # to Meteor directly and let it pick one essentially at random
for e_isbn in metadata['e-isbn']:
self.metadata.add_candidate('ISBN', Candidate(e_isbn, Origin.LLM))

# p-isbn - Meteor isn't interested in printed ISBNs

# e-issn
if 'e-issn' in metadata:
self.metadata.add_candidate('ISSN', Candidate(metadata['e-issn'], Origin.LLM))

        # p-issn - Meteor isn't interested in printed ISSNs

# type_coar - not supported by Meteor

@classmethod
def _api_url(cls) -> str | None:
return os.environ.get('LLM_API_URL')
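The parser above expects the model's reply to be a JSON object along these lines (an illustrative sketch with invented values, using the field names that _parse_response_to_doc consumes):

    # Illustrative model reply for _parse_response_to_doc; all values are invented.
    example_response = {
        "language": "fi",
        "title": "Example Report Title",
        "creator": ["Meikäläinen, Matti", "Virtanen, Maija"],
        "year": "2023",
        "publisher": ["Example Publisher"],
        "e-isbn": ["978-952-00-0000-0"],
        "e-issn": "1234-5678"
    }

Note that creator, publisher and e-isbn are parsed as lists, while e-issn is read as a single value.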
13 changes: 9 additions & 4 deletions metadata_extract/meteor.py
@@ -9,6 +9,7 @@
from .meteor_document import MeteorDocument
from .metadata import Results
from .finder import Finder
from .llm_extractor import LLMExtractor


class Meteor:
@@ -43,7 +44,11 @@ def set_language_detection_method(self, detect_language: Callable[[str], str]) -

def run(self, file_path: str) -> Results:
with MeteorDocument(file_path) as doc:
finder = Finder(doc, self.registry, self.detect_language)
finder.extract_metadata()
finder.metadata.choose_best()
return finder.metadata.results
            extractor: LLMExtractor | Finder
if LLMExtractor.is_available():
extractor = LLMExtractor(doc)
else:
extractor = Finder(doc, self.registry, self.detect_language)
extractor.extract_metadata()
extractor.metadata.choose_best()
return extractor.metadata.results
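To exercise the new code path end to end, LLM_API_URL must point at an OpenAI-compatible chat completions endpoint, for example a local llama.cpp server. A minimal sketch, with placeholder URL and file name:

    import os

    from metadata_extract.llm_extractor import LLMExtractor
    from metadata_extract.meteor_document import MeteorDocument

    # Placeholder URL; llama.cpp's server exposes an OpenAI-compatible
    # chat completions endpoint at /v1/chat/completions.
    os.environ['LLM_API_URL'] = 'http://localhost:8080/v1/chat/completions'

    with MeteorDocument('document.pdf') as doc:  # placeholder file name
        if LLMExtractor.is_available():
            extractor = LLMExtractor(doc)
            extractor.extract_metadata()
            extractor.metadata.choose_best()
            print(extractor.metadata.results)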
54 changes: 54 additions & 0 deletions metadata_extract/meteor_document.py
@@ -4,10 +4,13 @@
"""


import json
from pathlib import Path
import re
from types import TracebackType
from typing import Optional, Self, Type
import fitz
import regex
from .page import Page
from .alto_utils import AltoFile

@@ -19,6 +22,13 @@ class MeteorDocument:
content. MeteorDocuments are context managers, so they can be used in `with` statements.
"""

# text extraction settings for LLM
PAGES = [0, 1, 2, 3, 4, 5, 6, 7, -2, -1] # pages to analyze: first 8 pages + last 2 pages
THRESHOLD = 100 # paragraphs shorter than this will always be kept
LONG_PARA_PAGES = [0, 1] # on first two pages, some long paragraphs are accepted
LONG_PARA_MAX = 2 # how many long paragraphs to keep on the first two pages
PDF_METADATA_SKIP = {'format', 'creator', 'producer'} # PDF metadata fields not to include

def __init__(self, file_path: str,
start: int = 5,
end: int = 5):
@@ -92,3 +102,47 @@ def get_page_object(self, page_number: int) -> Page:
raise ValueError('No PDF file to load page from')
self.page_objects[page_number] = Page(pdf_page=self.pdfdoc.load_page(page_number - 1))
return self.page_objects[page_number]

def extract_text_as_json(self) -> str:
"""Extract text and metadata as a JSON string suitable for a LLM"""

if not self.pdfdoc:
raise ValueError('No PDF document set')

pdfinfo = {}
pages = []

for key in self.pdfdoc.metadata.keys():
if key not in self.PDF_METADATA_SKIP and self.pdfdoc.metadata.get(key):
pdfinfo[key] = self.pdfdoc.metadata.get(key)

        for page in self.PAGES:
            if page >= 0 and page >= len(self.pdfdoc) - 2:
                # non-negative index already covered by the negative indexes -2 and -1
                continue

texts = []
text = self.pdfdoc[page].get_text(sort=True)
# Use regular expression to split text into paragraphs
# Delimiter: newline(s) followed by an upper case character
paragraphs = regex.split(r'\n+(?=\p{Lu})', text, flags=re.UNICODE)
long_paragraph_count = 0

for paragraph in paragraphs:
paragraph = " ".join(paragraph.strip().split())

if '.....' in paragraph or '. . . . .' in paragraph:
# looks like a ToC entry, skip it
continue
if len(paragraph) < self.THRESHOLD: # short paragraph, keep it
texts.append(paragraph)
elif page in self.LONG_PARA_PAGES and long_paragraph_count < self.LONG_PARA_MAX:
# allow some long paragraphs on the first two pages
long_paragraph_count += 1
texts.append(paragraph)
else: # must be a long paragraph, skip it
pass
text = '\n'.join(texts)
if text:
pages.append({"page": self.pdfdoc[page].number, "text": text})

return json.dumps({"pdfinfo": pdfinfo, "pages": pages})
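For reference, the returned JSON string pairs the PDF metadata with the filtered page texts. An invented example of the payload the LLM receives (pretty-printed here):

    {
        "pdfinfo": {"title": "Example Report", "author": "Example Author"},
        "pages": [
            {"page": 0, "text": "Example Report\nA short opening paragraph."},
            {"page": 1, "text": "Another short paragraph kept by the filter."}
        ]
    }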
