From 2259f56dd28fa07ba14f157f57a0d403dbe878d7 Mon Sep 17 00:00:00 2001 From: Andrei Date: Tue, 21 Nov 2023 10:14:58 +0200 Subject: [PATCH] refactor(parse): :lipstick: Tidying up of the OutputFormatter class. Removing unused attributes and methods. Marking private methods as private. --- docs/user_guide/advanced.rst | 3 +- newspaper/article.py | 1 - newspaper/configuration.py | 8 +- newspaper/outputformatters.py | 138 ++++++++++++++-------------------- newspaper/settings.py | 29 +++++++ 5 files changed, 90 insertions(+), 89 deletions(-) diff --git a/docs/user_guide/advanced.rst b/docs/user_guide/advanced.rst index d464d3e..bbc2a83 100644 --- a/docs/user_guide/advanced.rst +++ b/docs/user_guide/advanced.rst @@ -52,8 +52,7 @@ For example, you could: # we are calling the shortcut function ``article()`` which will do the # downloading and parsing for us and return an ``Article`` object. - a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html' - , keep_article_html=True) + a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html') print(a.article_html) # '
\n

(CNN) -- Charles Smith insisted Sunda...' diff --git a/newspaper/article.py b/newspaper/article.py index cfd569b..7b19f56 100644 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -485,7 +485,6 @@ def parse(self) -> "Article": if self.config.use_meta_language: self.extractor.update_language(self.meta_lang) - output_formatter.update_language(self.meta_lang) self.meta_site_name = metadata["site_name"] self.meta_description = metadata["description"] diff --git a/newspaper/configuration.py b/newspaper/configuration.py index 280bfe7..f873fd2 100644 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -64,8 +64,10 @@ class Configuration: default True. follow_meta_refresh (bool): if True, it will follow meta refresh redirect when downloading an article. default False. - keep_article_html (bool): if True it will replace the - :any:`Article.html` property with the html of the body. + clean_article_html (bool): if True it will clean 'unnecessary' tags + from the article body html. + Affected property is :any:`Article.article_html`. + Default True. http_success_only (bool): if True, it will raise an ``ArticleException`` if the html status_code is >= 400 (e.g. 404 page) stopwords_class (obj): unique stopword classes for oriental languages, @@ -163,7 +165,7 @@ def __init__(self): self._use_meta_language = True # You may keep the html of just the main article body - self.keep_article_html = False + self.clean_article_html = True # Fail for error responses (e.g. 
404 page) self.http_success_only = True diff --git a/newspaper/outputformatters.py b/newspaper/outputformatters.py index 5985bd9..b4bdb15 100644 --- a/newspaper/outputformatters.py +++ b/newspaper/outputformatters.py @@ -7,93 +7,67 @@ """ from html import unescape import logging +from typing import Tuple import lxml -import newspaper.parsers as parsers -from .text import innerTrim - -CLEAN_ARTICLE_TAGS = [ - "a", - "span", - "p", - "br", - "strong", - "b", - "em", - "i", - "tt", - "code", - "pre", - "blockquote", - "img", - "h1", - "h2", - "h3", - "h4", - "h5", - "h6", - "ul", - "ol", - "li", - "dl", - "dt", - "dd", -] +from newspaper import parsers +from newspaper.configuration import Configuration +from newspaper.text import innerTrim +from newspaper import settings log = logging.getLogger(__name__) class OutputFormatter: - def __init__(self, config): - self.top_node = None - self.config = config - self.language = config.language - self.stopwords_class = config.stopwords_class - - def update_language(self, meta_lang): - """Required to be called before the extraction process in some - cases because the stopwords_class has to set in case the lang - is not latin based + """Class that converts the article top node into text, cleaning up + debris tags, replacing
<br> with newlines, etc. + + if `config.clean_article_html` is True, then the article's html is + cleaned as well. Only `settings.CLEAN_ARTICLE_TAGS` are allowed to + remain in the html. + """ + + def __init__(self, config=None): + self.config = config or Configuration() + + def get_formatted(self, top_node: lxml.html.HtmlElement) -> Tuple[str, str]: + """Returns the body text of an article, and also the cleaned html body + article of the article. + Arguments: + top_node {lxml.html.HtmlElement} -- The top node element of the article + Returns: + Tuple[str, str] -- The body text of the article, and the cleaned + html body of the article """ - if meta_lang: - self.language = meta_lang - self.stopwords_class = self.config.get_stopwords_class(meta_lang) - - def get_top_node(self): - return self.top_node - - def get_formatted(self, top_node): - """Returns the body text of an article, and also the body article - html if specified. Returns in (text, html) form - """ - self.top_node = top_node html, text = "", "" - if self.top_node is None: + if top_node is None: return (text, html) - self.remove_negativescores_nodes() + self._remove_negativescores_nodes(top_node) - if self.config.keep_article_html: - html = self.convert_to_html() + if self.config.clean_article_html: + html = self._convert_to_html(top_node) + else: + html = parsers.node_to_string(top_node) # remove a tags from article tree. Leaves the text intact - lxml.etree.strip_tags(self.top_node, "a") + lxml.etree.strip_tags(top_node, "a") - self.add_newline_to_br() - self.add_newline_to_li() + self._add_newline_to_br(top_node) + self._add_newline_to_li(top_node) # remove common tags from article tree. 
Leaves the text intact - lxml.etree.strip_tags(self.top_node, "b", "strong", "i", "br", "sup") + lxml.etree.strip_tags(top_node, "b", "strong", "i", "br", "sup") + + self._remove_empty_tags(top_node) + self._remove_trailing_media_div(top_node) + text = self._convert_to_text(top_node) - self.remove_empty_tags() - self.remove_trailing_media_div() - text = self.convert_to_text() - # print(parsers.nodeToString(self.get_top_node())) return (text, html) - def convert_to_text(self): + def _convert_to_text(self, top_node: lxml.html.HtmlElement): txts = [] - for node in list(self.get_top_node()): + for node in list(top_node): try: txt = parsers.get_text(node) except ValueError as err: # lxml error @@ -107,47 +81,45 @@ def convert_to_text(self): txts.extend(txt_lis) return "\n\n".join(txts) - def convert_to_html(self): + def _convert_to_html(self, top_node: lxml.html.HtmlElement): article_cleaner = lxml.html.clean.Cleaner() article_cleaner.javascript = True article_cleaner.style = True article_cleaner.remove_unknown_tags = False - article_cleaner.allow_tags = CLEAN_ARTICLE_TAGS + article_cleaner.allow_tags = settings.CLEAN_ARTICLE_TAGS - cleaned_node = article_cleaner.clean_html(self.get_top_node()) + cleaned_node = article_cleaner.clean_html(top_node) return parsers.node_to_string(cleaned_node) - def add_newline_to_br(self): - for e in parsers.get_tags(self.top_node, tag="br"): + def _add_newline_to_br(self, top_node: lxml.html.HtmlElement): + for e in parsers.get_tags(top_node, tag="br"): e.text = r"\n" - def add_newline_to_li(self): - for e in parsers.get_tags(self.top_node, tag="ul"): + def _add_newline_to_li(self, top_node: lxml.html.HtmlElement): + for e in parsers.get_tags(top_node, tag="ul"): li_list = parsers.get_tags(e, tag="li") for li in li_list[:-1]: li.text = parsers.get_text(li) + r"\n" for c in li.getchildren(): parsers.remove(c) - def remove_negativescores_nodes(self): + def _remove_negativescores_nodes(self, top_node: lxml.html.HtmlElement): """If there are 
elements inside our top node that have a negative gravity score, let's give em the boot. """ - if self.top_node is None: - return - gravity_items = self.top_node.xpath(".//*[@gravityScore]") + gravity_items = top_node.xpath(".//*[@gravityScore]") for item in gravity_items: - score = parsers.get_attribute(item, "gravityScore") - score = float(score) if score else 0 + score = item.attrib.get("gravityScore", "0") + score = float(score) if score < 1: item.getparent().remove(item) - def remove_empty_tags(self): + def _remove_empty_tags(self, top_node: lxml.html.HtmlElement): """It's common in top_node to have tags that are filled with data in their properties but do not have any displayable text. """ - all_nodes = parsers.get_tags(self.get_top_node()) + all_nodes = parsers.get_tags(top_node) all_nodes.reverse() for el in all_nodes: tag = el.tag @@ -160,7 +132,7 @@ def remove_empty_tags(self): ): parsers.remove(el) - def remove_trailing_media_div(self): + def _remove_trailing_media_div(self, top_node: lxml.html.HtmlElement): """Punish the *last top level* node in the top_node if it's DOM depth is too deep. Many media non-content links are eliminated: "related", "loading gallery", etc. It skips removal if @@ -183,7 +155,7 @@ def get_depth(node, depth=1): max_depth = e_depth return max_depth - top_level_nodes = self.get_top_node().getchildren() + top_level_nodes = top_node.getchildren() if len(top_level_nodes) < 3: return diff --git a/newspaper/settings.py b/newspaper/settings.py index 9fa23ac..643533f 100644 --- a/newspaper/settings.py +++ b/newspaper/settings.py @@ -55,6 +55,35 @@ "text_cleaned", ] +# Tags we allow to be left in the cleaned article body +CLEAN_ARTICLE_TAGS = [ + "a", + "span", + "p", + "br", + "strong", + "b", + "em", + "i", + "tt", + "code", + "pre", + "blockquote", + "img", + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "ul", + "ol", + "li", + "dl", + "dt", + "dd", +] + # Error log LOGFILE = TOP_DIRECTORY / f"newspaper_errors_{__version__}.log"