From 737c2269878af67c3e1435d9b858821b95cd1037 Mon Sep 17 00:00:00 2001
From: Andrei
Date: Sat, 23 Dec 2023 00:39:40 +0200
Subject: [PATCH] feat(tests): :sparkles: added evaluation script to test
 against the dataset from
 https://github.com/scrapinghub/article-extraction-benchmark/

---
 .gitignore                                 |   1 +
 newspaper/extractors/metadata_extractor.py |   2 +-
 newspaper/utils/__init__.py                |  27 +++++-
 tests/evaluation/evaluate.py               | 103 +++++++++++++++++++++
 4 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 tests/evaluation/evaluate.py

diff --git a/.gitignore b/.gitignore
index 92fae30..b933e13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .DS_Store
 .idea
 .pypirc
+.conda
 
 # C extensions
 *.so
diff --git a/newspaper/extractors/metadata_extractor.py b/newspaper/extractors/metadata_extractor.py
index 6bd31a1..e40fbfe 100644
--- a/newspaper/extractors/metadata_extractor.py
+++ b/newspaper/extractors/metadata_extractor.py
@@ -49,7 +49,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
         """
 
         def get_if_valid(s: str) -> Optional[str]:
-            if not s or len(s) < 2:
+            if s is None or len(s) < 2:
                 return None
             s = s[:2]
             if re.search(RE_LANG, s):
diff --git a/newspaper/utils/__init__.py b/newspaper/utils/__init__.py
index 9cbaa27..405683c 100644
--- a/newspaper/utils/__init__.py
+++ b/newspaper/utils/__init__.py
@@ -18,7 +18,6 @@
 import sys
 import threading
 import time
-
 from hashlib import sha1
 
 from bs4 import BeautifulSoup
@@ -317,6 +316,32 @@ def extend_config(config, config_items):
 
     return config
 
+def progressbar(it, prefix="", size=60, out=sys.stdout):
+    """Display a simple progress bar without
+    heavy dependencies like tqdm"""
+    count = len(it)
+    start = time.time()
+
+    def show(j):
+        x = int(size * j / count)
+        remaining = ((time.time() - start) / j) * (count - j)
+
+        mins, sec = divmod(remaining, 60)
+        time_str = f"{int(mins):02}:{sec:05.2f}"
+
+        print(
+            f"{prefix}[{u'█'*x}{('.'*(size-x))}] {j}/{count} Est wait {time_str}",
+            end="\r",
+            file=out,
+            flush=True,
+        )
+
+    for i, item in enumerate(it):
+        yield item
+        show(i + 1)
+    print("\n", flush=True, file=out)
+
+
 def print_node_tree(node, header="", last=True, with_gravity=True):
     """Prints out the html node tree for nodes with gravity scores
     debugging method
diff --git a/tests/evaluation/evaluate.py b/tests/evaluation/evaluate.py
new file mode 100644
index 0000000..837c8f9
--- /dev/null
+++ b/tests/evaluation/evaluate.py
@@ -0,0 +1,103 @@
+from urllib.parse import urljoin
+import requests
+import argparse
+import json
+import gzip
+from pathlib import Path
+import newspaper
+from nltk.translate import bleu_score
+
+from newspaper.utils import progressbar
+
+
+def read_or_download_json(url_or_path):
+    """Reads a json file from a url or a local path"""
+    if url_or_path.startswith("http"):
+        return requests.get(url_or_path, timeout=(5, 10)).json()
+    else:
+        with open(url_or_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+
+def get_html(url_or_path):
+    """Gets the html from a url or a local path"""
+    if url_or_path.startswith("http"):
+        content = requests.get(url_or_path, timeout=(5, 10)).content
+    else:
+        # binary mode cannot take an encoding argument; we decode below
+        with open(url_or_path, "rb") as f:
+            content = f.read()
+
+    if not url_or_path.lower().endswith(".gz"):
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return content
+
+    content = gzip.decompress(content)
+    return content.decode("utf-8")
+
+
+def main(args):
+    ground_truth = read_or_download_json(args.ground_truth)
+    results = {}
+    for filename, expected_article in progressbar(ground_truth.items()):
+        if not filename.endswith(".html") and not filename.endswith(".html.gz"):
+            filename += ".html.gz"
+
+        if args.html_folder.startswith("http"):
+            html = get_html(urljoin(args.html_folder, filename))
+        else:
+            # get_html expects a str (it calls .startswith/.endswith on it)
+            html = get_html(str(Path(args.html_folder) / filename))
+
+        article = newspaper.article(url=expected_article["url"], input_html=html)
+
+        parsed_result = article.text
+
+        # NLTK BLEU scores operate on token lists, not raw strings
+        results[filename] = {
+            "url": expected_article["url"],
+            "truth": expected_article["articleBody"],
+            "parsed": parsed_result,
+            "bleu_score": bleu_score.sentence_bleu(
+                [expected_article["articleBody"].split()], parsed_result.split()
+            ),
+            "precision": bleu_score.modified_precision(
+                [expected_article["articleBody"].split()], parsed_result.split(), n=5
+            ),
+        }
+    corpus_score = bleu_score.corpus_bleu(
+        [[result["truth"].split()] for result in results.values()],
+        [result["parsed"].split() for result in results.values()],
+    )
+    sorted_results = sorted(
+        [(k, result["url"], result["bleu_score"]) for k, result in results.items()],
+        key=lambda x: x[2],
+        reverse=False,
+    )
+
+    print(f"Corpus BLEU score: {corpus_score}")
+
+    print("Top 10 worst results:")
+    for filename, url, score in sorted_results[:10]:
+        print(f"{score} {filename} {url}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ground-truth",
+        type=str,
+        default="https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/master/ground-truth.json",
+        help="URL to the groundtruth json or a local path to the json file",
+    )
+    parser.add_argument(
+        "--html-folder",
+        type=str,
+        default="https://github.com/scrapinghub/article-extraction-benchmark/raw/master/html/",
+        help=(
+            "URL to the folder containing the html files or a local path to the folder"
+        ),
+    )
+    args = parser.parse_args()
+    main(args)