Skip to content

Commit

Permalink
refactor(parse): 💄 Tidying up of the OutputFormatter class. Removin…
Browse files Browse the repository at this point in the history
…g unused attributes and methods. Marking private methods as private.
  • Loading branch information
AndyTheFactory committed Nov 21, 2023
1 parent 31b9079 commit 2259f56
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 89 deletions.
3 changes: 1 addition & 2 deletions docs/user_guide/advanced.rst
Original file line number Diff line number Diff line change
Expand Up @@ -52,8 +52,7 @@ For example, you could:
# we are calling the shortcut function ``article()`` which will do the
# downloading and parsing for us and return an ``Article`` object.
a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html'
, keep_article_html=True)
a = article('http://www.cnn.com/2014/01/12/world/asia/north-korea-charles-smith/index.html')
print(a.article_html)
# '<div> \n<p><strong>(CNN)</strong> -- Charles Smith insisted Sunda...'
Expand Down
1 change: 0 additions & 1 deletion newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -485,7 +485,6 @@ def parse(self) -> "Article":

if self.config.use_meta_language:
self.extractor.update_language(self.meta_lang)
output_formatter.update_language(self.meta_lang)

self.meta_site_name = metadata["site_name"]
self.meta_description = metadata["description"]
Expand Down
8 changes: 5 additions & 3 deletions newspaper/configuration.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,10 @@ class Configuration:
default True.
follow_meta_refresh (bool): if True, it will follow meta refresh
redirect when downloading an article. default False.
keep_article_html (bool): if True it will replace the
:any:`Article.html` property with the html of the body.
clean_article_html (bool): if True it will clean 'unnecessary' tags
from the article body html.
Affected property is :any:`Article.article_html`.
Default True.
http_success_only (bool): if True, it will raise an ``ArticleException``
if the html status_code is >= 400 (e.g. 404 page)
stopwords_class (obj): unique stopword classes for oriental languages,
Expand Down Expand Up @@ -163,7 +165,7 @@ def __init__(self):
self._use_meta_language = True

# You may keep the html of just the main article body
self.keep_article_html = False
self.clean_article_html = True

# Fail for error responses (e.g. 404 page)
self.http_success_only = True
Expand Down
138 changes: 55 additions & 83 deletions newspaper/outputformatters.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,93 +7,67 @@
"""
from html import unescape
import logging
from typing import Tuple

import lxml
import newspaper.parsers as parsers
from .text import innerTrim

CLEAN_ARTICLE_TAGS = [
"a",
"span",
"p",
"br",
"strong",
"b",
"em",
"i",
"tt",
"code",
"pre",
"blockquote",
"img",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"dl",
"dt",
"dd",
]
from newspaper import parsers
from newspaper.configuration import Configuration
from newspaper.text import innerTrim
from newspaper import settings

log = logging.getLogger(__name__)


class OutputFormatter:
def __init__(self, config):
self.top_node = None
self.config = config
self.language = config.language
self.stopwords_class = config.stopwords_class

def update_language(self, meta_lang):
"""Required to be called before the extraction process in some
cases because the stopwords_class has to be set in case the language
is not Latin-based
"""Class that converts the article top node into text, cleaning up
debris tags, replacing <br> with newlines, etc.
if `config.clean_article_html` is True, then the article's html is
cleaned as well. Only `settings.CLEAN_ARTICLE_TAGS` are allowed to
remain in the html.
"""

def __init__(self, config=None):
self.config = config or Configuration()

def get_formatted(self, top_node: lxml.html.HtmlElement) -> Tuple[str, str]:
"""Returns the body text of an article, and also the cleaned
html body of the article.
Arguments:
top_node {lxml.html.HtmlElement} -- The top node element of the article
Returns:
Tuple[str, str] -- The body text of the article, and the cleaned
html body of the article
"""
if meta_lang:
self.language = meta_lang
self.stopwords_class = self.config.get_stopwords_class(meta_lang)

def get_top_node(self):
return self.top_node

def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
html, text = "", ""
if self.top_node is None:
if top_node is None:
return (text, html)

self.remove_negativescores_nodes()
self._remove_negativescores_nodes(top_node)

if self.config.keep_article_html:
html = self.convert_to_html()
if self.config.clean_article_html:
html = self._convert_to_html(top_node)
else:
html = parsers.node_to_string(top_node)

# remove a tags from article tree. Leaves the text intact
lxml.etree.strip_tags(self.top_node, "a")
lxml.etree.strip_tags(top_node, "a")

self.add_newline_to_br()
self.add_newline_to_li()
self._add_newline_to_br(top_node)
self._add_newline_to_li(top_node)

# remove common tags from article tree. Leaves the text intact
lxml.etree.strip_tags(self.top_node, "b", "strong", "i", "br", "sup")
lxml.etree.strip_tags(top_node, "b", "strong", "i", "br", "sup")

self._remove_empty_tags(top_node)
self._remove_trailing_media_div(top_node)
text = self._convert_to_text(top_node)

self.remove_empty_tags()
self.remove_trailing_media_div()
text = self.convert_to_text()
# print(parsers.nodeToString(self.get_top_node()))
return (text, html)

def convert_to_text(self):
def _convert_to_text(self, top_node: lxml.html.HtmlElement):
txts = []
for node in list(self.get_top_node()):
for node in list(top_node):
try:
txt = parsers.get_text(node)
except ValueError as err: # lxml error
Expand All @@ -107,47 +81,45 @@ def convert_to_text(self):
txts.extend(txt_lis)
return "\n\n".join(txts)

def convert_to_html(self):
def _convert_to_html(self, top_node: lxml.html.HtmlElement):
article_cleaner = lxml.html.clean.Cleaner()
article_cleaner.javascript = True
article_cleaner.style = True
article_cleaner.remove_unknown_tags = False

article_cleaner.allow_tags = CLEAN_ARTICLE_TAGS
article_cleaner.allow_tags = settings.CLEAN_ARTICLE_TAGS

cleaned_node = article_cleaner.clean_html(self.get_top_node())
cleaned_node = article_cleaner.clean_html(top_node)
return parsers.node_to_string(cleaned_node)

def add_newline_to_br(self):
for e in parsers.get_tags(self.top_node, tag="br"):
def _add_newline_to_br(self, top_node: lxml.html.HtmlElement):
for e in parsers.get_tags(top_node, tag="br"):
e.text = r"\n"

def add_newline_to_li(self):
for e in parsers.get_tags(self.top_node, tag="ul"):
def _add_newline_to_li(self, top_node: lxml.html.HtmlElement):
for e in parsers.get_tags(top_node, tag="ul"):
li_list = parsers.get_tags(e, tag="li")
for li in li_list[:-1]:
li.text = parsers.get_text(li) + r"\n"
for c in li.getchildren():
parsers.remove(c)

def remove_negativescores_nodes(self):
def _remove_negativescores_nodes(self, top_node: lxml.html.HtmlElement):
"""If there are elements inside our top node that have a
negative gravity score, let's give em the boot.
"""
if self.top_node is None:
return
gravity_items = self.top_node.xpath(".//*[@gravityScore]")
gravity_items = top_node.xpath(".//*[@gravityScore]")
for item in gravity_items:
score = parsers.get_attribute(item, "gravityScore")
score = float(score) if score else 0
score = item.attrib.get("gravityScore", "0")
score = float(score)
if score < 1:
item.getparent().remove(item)

def remove_empty_tags(self):
def _remove_empty_tags(self, top_node: lxml.html.HtmlElement):
"""It's common in top_node to have tags that are filled with data
in their properties but do not have any displayable text.
"""
all_nodes = parsers.get_tags(self.get_top_node())
all_nodes = parsers.get_tags(top_node)
all_nodes.reverse()
for el in all_nodes:
tag = el.tag
Expand All @@ -160,7 +132,7 @@ def remove_empty_tags(self):
):
parsers.remove(el)

def remove_trailing_media_div(self):
def _remove_trailing_media_div(self, top_node: lxml.html.HtmlElement):
"""Punish the *last top level* node in the top_node if it's
DOM depth is too deep. Many media non-content links are
eliminated: "related", "loading gallery", etc. It skips removal if
Expand All @@ -183,7 +155,7 @@ def get_depth(node, depth=1):
max_depth = e_depth
return max_depth

top_level_nodes = self.get_top_node().getchildren()
top_level_nodes = top_node.getchildren()
if len(top_level_nodes) < 3:
return

Expand Down
29 changes: 29 additions & 0 deletions newspaper/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,35 @@
"text_cleaned",
]

# Tags we allow to be left in the cleaned article body
CLEAN_ARTICLE_TAGS = [
"a",
"span",
"p",
"br",
"strong",
"b",
"em",
"i",
"tt",
"code",
"pre",
"blockquote",
"img",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"dl",
"dt",
"dd",
]


# Error log
LOGFILE = TOP_DIRECTORY / f"newspaper_errors_{__version__}.log"
Expand Down

0 comments on commit 2259f56

Please sign in to comment.