Skip to content

Commit

Permalink
refactor(parse): 💄 Tidying up of the OutputFormatter class. Removin…
Browse files Browse the repository at this point in the history
…g unused attributes and methods. Marking private methods as private.
  • Loading branch information
AndyTheFactory committed Nov 21, 2023
1 parent 31b9079 commit 2259f56
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 89 deletions.
3 changes: 1 addition & 2 deletions docs/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ For example, you could:
# we are calling the shortcut function ``article()`` which will do the
# downloading and parsing for us and return an ``Article`` object.
a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html'
, keep_article_html=True)
a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html')
print(a.article_html)
# '<div> \n<p><strong>(CNN)</strong> -- Charles Smith insisted Sunda...'
Expand Down
1 change: 0 additions & 1 deletion newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,6 @@ def parse(self) -> "Article":

if self.config.use_meta_language:
self.extractor.update_language(self.meta_lang)
output_formatter.update_language(self.meta_lang)

self.meta_site_name = metadata["site_name"]
self.meta_description = metadata["description"]
Expand Down
8 changes: 5 additions & 3 deletions newspaper/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ class Configuration:
default True.
follow_meta_refresh (bool): if True, it will follow meta refresh
redirect when downloading an article. default False.
keep_article_html (bool): if True it will replace the
:any:`Article.html` property with the html of the body.
clean_article_html (bool): if True it will clean 'unnecessary' tags
from the article body html.
Affected property is :any:`Article.article_html`.
Default True.
http_success_only (bool): if True, it will raise an ``ArticleException``
if the html status_code is >= 400 (e.g. 404 page)
stopwords_class (obj): unique stopword classes for oriental languages,
Expand Down Expand Up @@ -163,7 +165,7 @@ def __init__(self):
self._use_meta_language = True

# You may keep the html of just the main article body
self.keep_article_html = False
self.clean_article_html = True

# Fail for error responses (e.g. 404 page)
self.http_success_only = True
Expand Down
138 changes: 55 additions & 83 deletions newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,93 +7,67 @@
"""
from html import unescape
import logging
from typing import Tuple

import lxml
import newspaper.parsers as parsers
from .text import innerTrim

CLEAN_ARTICLE_TAGS = [
"a",
"span",
"p",
"br",
"strong",
"b",
"em",
"i",
"tt",
"code",
"pre",
"blockquote",
"img",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"dl",
"dt",
"dd",
]
from newspaper import parsers
from newspaper.configuration import Configuration
from newspaper.text import innerTrim
from newspaper import settings

log = logging.getLogger(__name__)


class OutputFormatter:
def __init__(self, config):
self.top_node = None
self.config = config
self.language = config.language
self.stopwords_class = config.stopwords_class

def update_language(self, meta_lang):
"""Required to be called before the extraction process in some
cases because the stopwords_class has to be set in case the language
is not Latin-based
"""Class that converts the article top node into text, cleaning up
debris tags, replacing <br> with newlines, etc.
if `config.clean_article_html` is True, then the article's html is
cleaned as well. Only `settings.CLEAN_ARTICLE_TAGS` are allowed to
remain in the html.
"""

def __init__(self, config=None):
self.config = config or Configuration()

def get_formatted(self, top_node: lxml.html.HtmlElement) -> Tuple[str, str]:
"""Returns the body text of an article, and also the cleaned
html body of the article.
Arguments:
top_node {lxml.html.HtmlElement} -- The top node element of the article
Returns:
Tuple[str, str] -- The body text of the article, and the cleaned
html body of the article
"""
if meta_lang:
self.language = meta_lang
self.stopwords_class = self.config.get_stopwords_class(meta_lang)

def get_top_node(self):
return self.top_node

def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
html, text = "", ""
if self.top_node is None:
if top_node is None:
return (text, html)

self.remove_negativescores_nodes()
self._remove_negativescores_nodes(top_node)

if self.config.keep_article_html:
html = self.convert_to_html()
if self.config.clean_article_html:
html = self._convert_to_html(top_node)
else:
html = parsers.node_to_string(top_node)

# remove a tags from article tree. Leaves the text intact
lxml.etree.strip_tags(self.top_node, "a")
lxml.etree.strip_tags(top_node, "a")

self.add_newline_to_br()
self.add_newline_to_li()
self._add_newline_to_br(top_node)
self._add_newline_to_li(top_node)

# remove common tags from article tree. Leaves the text intact
lxml.etree.strip_tags(self.top_node, "b", "strong", "i", "br", "sup")
lxml.etree.strip_tags(top_node, "b", "strong", "i", "br", "sup")

self._remove_empty_tags(top_node)
self._remove_trailing_media_div(top_node)
text = self._convert_to_text(top_node)

self.remove_empty_tags()
self.remove_trailing_media_div()
text = self.convert_to_text()
# print(parsers.nodeToString(self.get_top_node()))
return (text, html)

def convert_to_text(self):
def _convert_to_text(self, top_node: lxml.html.HtmlElement):
txts = []
for node in list(self.get_top_node()):
for node in list(top_node):
try:
txt = parsers.get_text(node)
except ValueError as err: # lxml error
Expand All @@ -107,47 +81,45 @@ def convert_to_text(self):
txts.extend(txt_lis)
return "\n\n".join(txts)

def convert_to_html(self):
def _convert_to_html(self, top_node: lxml.html.HtmlElement):
article_cleaner = lxml.html.clean.Cleaner()
article_cleaner.javascript = True
article_cleaner.style = True
article_cleaner.remove_unknown_tags = False

article_cleaner.allow_tags = CLEAN_ARTICLE_TAGS
article_cleaner.allow_tags = settings.CLEAN_ARTICLE_TAGS

cleaned_node = article_cleaner.clean_html(self.get_top_node())
cleaned_node = article_cleaner.clean_html(top_node)
return parsers.node_to_string(cleaned_node)

def add_newline_to_br(self):
for e in parsers.get_tags(self.top_node, tag="br"):
def _add_newline_to_br(self, top_node: lxml.html.HtmlElement):
for e in parsers.get_tags(top_node, tag="br"):
e.text = r"\n"

def add_newline_to_li(self):
for e in parsers.get_tags(self.top_node, tag="ul"):
def _add_newline_to_li(self, top_node: lxml.html.HtmlElement):
for e in parsers.get_tags(top_node, tag="ul"):
li_list = parsers.get_tags(e, tag="li")
for li in li_list[:-1]:
li.text = parsers.get_text(li) + r"\n"
for c in li.getchildren():
parsers.remove(c)

def remove_negativescores_nodes(self):
def _remove_negativescores_nodes(self, top_node: lxml.html.HtmlElement):
"""If there are elements inside our top node that have a
negative gravity score, let's give em the boot.
"""
if self.top_node is None:
return
gravity_items = self.top_node.xpath(".//*[@gravityScore]")
gravity_items = top_node.xpath(".//*[@gravityScore]")
for item in gravity_items:
score = parsers.get_attribute(item, "gravityScore")
score = float(score) if score else 0
score = item.attrib.get("gravityScore", "0")
score = float(score)
if score < 1:
item.getparent().remove(item)

def remove_empty_tags(self):
def _remove_empty_tags(self, top_node: lxml.html.HtmlElement):
"""It's common in top_node to have tags that are filled with data
in their properties but do not have any displayable text.
"""
all_nodes = parsers.get_tags(self.get_top_node())
all_nodes = parsers.get_tags(top_node)
all_nodes.reverse()
for el in all_nodes:
tag = el.tag
Expand All @@ -160,7 +132,7 @@ def remove_empty_tags(self):
):
parsers.remove(el)

def remove_trailing_media_div(self):
def _remove_trailing_media_div(self, top_node: lxml.html.HtmlElement):
"""Punish the *last top level* node in the top_node if it's
DOM depth is too deep. Many media non-content links are
eliminated: "related", "loading gallery", etc. It skips removal if
Expand All @@ -183,7 +155,7 @@ def get_depth(node, depth=1):
max_depth = e_depth
return max_depth

top_level_nodes = self.get_top_node().getchildren()
top_level_nodes = top_node.getchildren()
if len(top_level_nodes) < 3:
return

Expand Down
29 changes: 29 additions & 0 deletions newspaper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,35 @@
"text_cleaned",
]

# Tags we allow to be left in the cleaned article body
CLEAN_ARTICLE_TAGS = [
"a",
"span",
"p",
"br",
"strong",
"b",
"em",
"i",
"tt",
"code",
"pre",
"blockquote",
"img",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"dl",
"dt",
"dd",
]


# Error log
LOGFILE = TOP_DIRECTORY / f"newspaper_errors_{__version__}.log"
Expand Down

0 comments on commit 2259f56

Please sign in to comment.