diff --git a/MetadataExtractor/Extractors/Text/SummaryExtract.py b/MetadataExtractor/Extractors/Text/SummaryExtract.py
index a45d690..d389c1a 100644
--- a/MetadataExtractor/Extractors/Text/SummaryExtract.py
+++ b/MetadataExtractor/Extractors/Text/SummaryExtract.py
@@ -1,24 +1,35 @@
 from .ITextExtract import ITextExtract
 from MetadataExtractor.Util import metadataCreation, metadataFormatter
-import gensim
-from gensim.summarization.textcleaner import (
-    clean_text_by_sentences as _clean_text_by_sentences,
-)
+from transformers import pipeline
 import logging
 
 log = logging.getLogger(__name__)
 
 
 class SummaryExtract(ITextExtract):
+    def __init__(self, config):
+        ITextExtract.__init__(self, config)
+        # Build the pipeline once so the model is loaded a single time and
+        # reused by every text_extract() call on this extractor.
+        self.summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+
     def text_extract(self, text, fileInfo):
-        sentences = _clean_text_by_sentences(text)
         trig = ""
 
-        if len(sentences) > 1:
+        if isinstance(text, str) and len(text.strip()) > 30:
             log.info("Executing Summary extraction.")
 
-            gensim_summary = gensim.summarization.summarize(text)
-            formattedSummary = gensim_summary.replace("\\", "").replace('"""', "'''")
+            # truncation=True clips inputs longer than the model's maximum
+            # input length instead of letting the call fail on large files.
+            summary = self.summarizer(
+                text,
+                max_length=200,
+                min_length=30,
+                do_sample=False,
+                truncation=True,
+            )[0]["summary_text"]
+            # Same clean-up the gensim-based version applied to its summary.
+            formattedSummary = summary.replace("\\", "").replace('"""', "'''")
 
             trig = metadataCreation.addMetadataToFileGraph(
                 fileInfo,
diff --git a/requirements.txt b/requirements.txt
index a89eb72..011f299 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -55,4 +55,5 @@ flowkit==1.0.0
 openai-whisper
 soundfile
 pefile==2023.2.7
-pydicom==2.4.4
\ No newline at end of file
+pydicom==2.4.4
+transformers==4.42.3
\ No newline at end of file