Skip to content

Commit

Permalink
feat(tests): ✨ add evaluation script to test against the dataset from the article-extraction-benchmark repository
Browse files Browse the repository at this point in the history
  • Loading branch information
AndyTheFactory committed Dec 22, 2023
1 parent e73ffd9 commit 737c226
Show file tree
Hide file tree
Showing 4 changed files with 128 additions and 2 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
.DS_Store
.idea
.pypirc
.conda

# C extensions
*.so
Expand Down
2 changes: 1 addition & 1 deletion newspaper/extractors/metadata_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
"""

def get_if_valid(s: str) -> Optional[str]:
if not s or len(s) < 2:
if s is None or len(s) < 2:
return None
s = s[:2]
if re.search(RE_LANG, s):
Expand Down
27 changes: 26 additions & 1 deletion newspaper/utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@
import sys
import threading
import time

from hashlib import sha1

from bs4 import BeautifulSoup
Expand Down Expand Up @@ -317,6 +316,32 @@ def extend_config(config, config_items):
return config


def progressbar(it, prefix="", size=60, out=sys.stdout):
    """Display a simple textual progress bar without heavy
    dependencies like tqdm.

    Args:
        it: a sized iterable (must support ``len()``).
        prefix: text printed before the bar.
        size: width of the bar, in characters.
        out: stream the bar is written to (defaults to stdout).

    Yields:
        The items of ``it``, redrawing the bar after each one.
    """
    count = len(it)
    start = time.time()

    def show(j):
        # j >= 1 whenever show() is called, so no division by zero below.
        x = int(size * j / count)
        # Naive ETA: assume the average time per item so far keeps holding.
        remaining = ((time.time() - start) / j) * (count - j)

        mins, sec = divmod(remaining, 60)
        time_str = f"{int(mins):02}:{sec:05.2f}"

        print(
            f"{prefix}[{'█'*x}{('.'*(size-x))}] {j}/{count} Est wait {time_str}",
            end="\r",
            file=out,
            flush=True,
        )

    for i, item in enumerate(it):
        yield item
        show(i + 1)
    # Terminate the \r-overwritten bar line with a single newline.
    # (print("\n") would emit two: the literal one plus print's own.)
    print(file=out, flush=True)


def print_node_tree(node, header="", last=True, with_gravity=True):
"""Prints out the html node tree for nodes with gravity scores
debugging method
Expand Down
100 changes: 100 additions & 0 deletions tests/evaluation/evaluate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from urllib.parse import urljoin
import requests
import argparse
import json
import gzip
from pathlib import Path
import newspaper
from nltk.translate import bleu_score

from newspaper.utils import progressbar


def read_or_download_json(url_or_path):
    """Load JSON from a remote URL or a local file path.

    Anything starting with ``http`` is fetched over the network;
    everything else is treated as a UTF-8 encoded file on disk.
    """
    if not url_or_path.startswith("http"):
        with open(url_or_path, "r", encoding="utf-8") as fh:
            return json.load(fh)
    response = requests.get(url_or_path, timeout=(5, 10))
    return response.json()


def get_html(url_or_path):
    """Return the HTML for a url or a local path as ``str``.

    Accepts a ``str`` or a ``pathlib.Path``. Sources ending in ``.gz``
    are gzip-decompressed first; the result is decoded as UTF-8.
    """
    # main() passes a pathlib.Path for local folders; normalize to str
    # so startswith()/endswith() work for both call styles.
    url_or_path = str(url_or_path)
    if url_or_path.startswith("http"):
        content = requests.get(url_or_path, timeout=(5, 10)).content
    else:
        # Binary mode: open() rejects an encoding argument for "rb".
        with open(url_or_path, "rb") as f:
            content = f.read()

    if not url_or_path.lower().endswith(".gz"):
        if isinstance(content, bytes):
            content = content.decode("utf-8")
        return content

    content = gzip.decompress(content)
    return content.decode("utf-8")


def main(args):
    """Evaluate newspaper's extraction against a ground-truth dataset.

    For every entry in the ground-truth json, fetches the matching html
    file, parses it with newspaper, and scores the extracted text with
    BLEU against the expected article body. Prints the corpus-level
    score and the 10 worst-scoring documents.
    """
    ground_truth = read_or_download_json(args.ground_truth)
    results = {}
    for filename, expected_article in progressbar(ground_truth.items()):
        # Ground-truth keys may omit the extension; stored files are gzipped.
        if not filename.endswith((".html", ".html.gz")):
            filename += ".html.gz"

        if args.html_folder.startswith("http"):
            html = get_html(urljoin(args.html_folder, filename))
        else:
            html = get_html(Path(args.html_folder) / filename)

        article = newspaper.article(url=expected_article["url"], input_html=html)

        parsed_result = article.text

        # NOTE(review): nltk's bleu functions expect token lists; passing raw
        # strings scores character n-grams — confirm this is intended.
        results[filename] = {
            "url": expected_article["url"],
            "truth": expected_article["articleBody"],
            "parsed": parsed_result,
            "bleu_score": bleu_score.sentence_bleu(
                [expected_article["articleBody"]], parsed_result
            ),
            "precision": bleu_score.modified_precision(
                [expected_article["articleBody"]], parsed_result, n=5
            ),
        }
    corpus_score = bleu_score.corpus_bleu(
        [[result["truth"]] for result in results.values()],
        [result["parsed"] for result in results.values()],
    )
    # Ascending sort: the lowest BLEU scores (worst extractions) come first.
    sorted_results = sorted(
        [(k, result["url"], result["bleu_score"]) for k, result in results.items()],
        key=lambda x: x[2],
        reverse=False,
    )

    print(f"Corpus BLEU score: {corpus_score}")

    print("Top 10 worst results:")
    for filename, url, score in sorted_results[:10]:
        # Fixed: print the actual filename (was the literal "(unknown)").
        print(f"{score} {filename} {url} ")


if __name__ == "__main__":
    # Command-line entry point. Defaults point at the scrapinghub
    # article-extraction-benchmark dataset hosted on GitHub.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "--ground-truth",
        type=str,
        default="https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/master/ground-truth.json",
        help="URL to the groundtruth json or a local path to the json file",
    )
    arg_parser.add_argument(
        "--html-folder",
        type=str,
        default="https://github.com/scrapinghub/article-extraction-benchmark/raw/master/html/",
        help="URL to the folder containing the html files or a local path to the folder",
    )
    main(arg_parser.parse_args())

0 comments on commit 737c226

Please sign in to comment.