Skip to content

Commit

Permalink
feat(parse): 🚀 improvements in the article body extraction. Some sections that were ignored are now added to the extracted text.
Browse files Browse the repository at this point in the history

- better article body extraction
- removal of divs that seem to be ads or boilerplate
- clearer rules for trailing text removal (e.g. comments, author info, etc.)
  • Loading branch information
AndyTheFactory committed Dec 19, 2023
1 parent 22327d8 commit 1af12d2
Show file tree
Hide file tree
Showing 21 changed files with 567 additions and 278 deletions.
8 changes: 5 additions & 3 deletions newspaper/article.py
Original file line number Diff line number Diff line change
Expand Up @@ -500,16 +500,18 @@ def parse(self) -> "Article":

# Top node in the original documentDOM
self.top_node = self.extractor.calculate_best_node(self.doc)
# Off-tree Node containing the top node and any relevant siblings
self._top_node_complemented = self.extractor.top_node_complemented

# Top node in the cleaned version of the DOM
self.clean_top_node = self.extractor.calculate_best_node(self.clean_doc)

self.set_movies(self.extractor.get_videos(self.doc, self.top_node))

if self.top_node is not None:
# Off-tree Node containing the top node and any relevant siblings
self._top_node_complemented = self.extractor.top_node_complemented

self._top_node_complemented = document_cleaner.clean(
self._top_node_complemented
)
text, article_html = output_formatter.get_formatted(
self._top_node_complemented
)
Expand Down
77 changes: 58 additions & 19 deletions newspaper/cleaners.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,10 @@ def __init__(self, config):
"|konafilter|KonaFilter|breadcrumbs|^fn$|wp-caption-text"
"|legende|ajoutVideo|timestamp|js_replies"
)

self.remove_nodes_related_re = (
r"related[-\s\_]?(search|topics|media|info|tags|article|content|links)|"
r"(search|topics|media|info|tags|article|content|links)[-\s\_]?related"
)
self.div_to_p_re = r"<(a|blockquote|dl|div|img|ol|p|pre|table|ul)"
self.caption_re = "^caption$"
self.google_re = " google "
Expand Down Expand Up @@ -70,11 +73,20 @@ def clean(self, doc_to_clean):
doc_to_clean = self.remove_nodes_regex(
doc_to_clean, self.facebook_broadcasting_re
)
# Remove "related" sections
doc_to_clean = self.remove_nodes_regex(
doc_to_clean, self.remove_nodes_related_re
)

# Remove spans inside of paragraphs
doc_to_clean = self.clean_para_spans(doc_to_clean)
doc_to_clean = self.div_to_para(doc_to_clean, "div")
doc_to_clean = self.div_to_para(doc_to_clean, "span")
doc_to_clean = self.div_to_para(doc_to_clean, "section")

doc_to_clean = self.tag_to_para(doc_to_clean, "div")
# doc_to_clean = self.tag_to_para(doc_to_clean, "span")
doc_to_clean = self.tag_to_para(doc_to_clean, "section")

doc_to_clean = self.reduce_article(doc_to_clean)

return doc_to_clean

def clean_body_classes(self, doc):
Expand Down Expand Up @@ -262,24 +274,51 @@ def get_child_nodes_with_text(node):

return nodes_to_return

def tag_to_para(self, doc, dom_type):
    """Convert container elements of ``dom_type`` into paragraph structure.

    Containers with no block-level descendants are re-tagged as ``<p>``
    directly (the original tag name is remembered in the ``_initial_tag``
    attribute so later stages can inspect it); containers that do hold
    block-level children are rebuilt from the replacement nodes computed
    by ``get_replacement_nodes`` while preserving their attributes.

    Args:
        doc: lxml document being cleaned (mutated in place).
        dom_type (str): tag name to convert, e.g. ``"div"`` or ``"section"``.

    Returns:
        The mutated document.
    """
    block_tags = ["a", "blockquote", "dl", "div", "img", "ol", "p", "pre", "table", "ul"]
    for node in parsers.get_tags(doc, tag=dom_type):
        if node is None:
            continue

        # No block-level descendants: the container holds only inline
        # content, so it can simply become a paragraph.
        if not parsers.get_elements_by_tagslist(node, block_tags):
            node.attrib["_initial_tag"] = node.tag
            node.tag = "p"
            continue

        # Otherwise rebuild the container from its replacement nodes,
        # restoring the original attributes afterwards (clear() wipes them).
        replace_nodes = [
            n for n in self.get_replacement_nodes(doc, node) if n is not None
        ]
        attrib = copy.deepcopy(node.attrib)
        node.clear()
        for i, new_node in enumerate(replace_nodes):
            node.insert(i, new_node)
        for name, value in attrib.items():
            node.set(name, value)
    return doc

def reduce_article(self, doc):
    """Strip empty, non-content tags from the document body.

    Any element whose tag is not in the content whitelist and that carries
    neither ``text`` nor ``tail`` text is dropped; ``drop_tag`` hoists its
    children (if any) into the parent, so nested content survives.

    Args:
        doc: lxml document to reduce (mutated in place).

    Returns:
        The mutated document (unchanged if it has no ``<body>``).
    """
    body_tag = parsers.get_tags(doc, tag="body")
    if not body_tag:
        return doc

    # Set for O(1) membership tests on every element visited.
    content_tags = {
        "p", "br", "img",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "ul", "body", "article", "section",
    }
    # Materialize the iterator before mutating: drop_tag() restructures
    # the tree, and mutating while lazily iterating can skip elements.
    for item in list(body_tag[0].iter()):
        if item.tag not in content_tags and item.text is None and item.tail is None:
            item.drop_tag()

    return doc
Loading

0 comments on commit 1af12d2

Please sign in to comment.