Skip to content

Commit

Permalink
Better Summary Extraction
Browse files Browse the repository at this point in the history
  • Loading branch information
BenediktHeinrichs committed Jul 3, 2024
1 parent c6259bd commit fed9aa5
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 9 deletions.
15 changes: 7 additions & 8 deletions MetadataExtractor/Extractors/Text/SummaryExtract.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,23 @@
from .ITextExtract import ITextExtract
from MetadataExtractor.Util import metadataCreation, metadataFormatter
import gensim
from gensim.summarization.textcleaner import (
clean_text_by_sentences as _clean_text_by_sentences,
)
from transformers import pipeline
import logging

log = logging.getLogger(__name__)


class SummaryExtract(ITextExtract):
def __init__(self, config):
    """Set up the extractor and create its summarization pipeline.

    Args:
        config: Configuration object, passed unchanged to
            ``ITextExtract.__init__``.
    """
    ITextExtract.__init__(self, config)

    # Arguments for the ``transformers`` pipeline factory.
    # NOTE(review): building the pipeline here presumably loads the model
    # when the extractor is constructed -- confirm the start-up cost is
    # acceptable for every place this class is instantiated.
    task_name = "summarization"
    model_name = "facebook/bart-large-cnn"
    self.summarizer = pipeline(task_name, model=model_name)

def text_extract(self, text, fileInfo):
sentences = _clean_text_by_sentences(text)
trig = ""

if len(sentences) > 1:
if isinstance(text, str) and len(text.strip()) > 30:
log.info("Executing Summary extraction.")
gensim_summary = gensim.summarization.summarize(text)

formattedSummary = gensim_summary.replace("\\", "").replace('"""', "'''")
formattedSummary = self.summarizer(text, max_length=200, min_length=30, do_sample=False)[0]["summary_text"]

trig = metadataCreation.addMetadataToFileGraph(
fileInfo,
Expand Down
3 changes: 2 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,5 @@ flowkit==1.0.0
openai-whisper
soundfile
pefile==2023.2.7
pydicom==2.4.4
pydicom==2.4.4
transformers==4.42.3

0 comments on commit fed9aa5

Please sign in to comment.