From 737c2269878af67c3e1435d9b858821b95cd1037 Mon Sep 17 00:00:00 2001
From: Andrei
Date: Sat, 23 Dec 2023 00:39:40 +0200
Subject: [PATCH] feat(tests): :sparkles: added evaluation script to test
 against the dataset from
 https://github.com/scrapinghub/article-extraction-benchmark/

---
 .gitignore                                 |   1 +
 newspaper/extractors/metadata_extractor.py |   2 +-
 newspaper/utils/__init__.py                |  27 +++++-
 tests/evaluation/evaluate.py               | 103 +++++++++++++++++++++
 4 files changed, 131 insertions(+), 2 deletions(-)
 create mode 100644 tests/evaluation/evaluate.py

diff --git a/.gitignore b/.gitignore
index 92fae30..b933e13 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,7 @@
 .DS_Store
 .idea
 .pypirc
+.conda
 
 # C extensions
 *.so
diff --git a/newspaper/extractors/metadata_extractor.py b/newspaper/extractors/metadata_extractor.py
index 6bd31a1..e40fbfe 100644
--- a/newspaper/extractors/metadata_extractor.py
+++ b/newspaper/extractors/metadata_extractor.py
@@ -49,7 +49,7 @@ def _get_meta_language(self, doc: lxml.html.Element) -> Optional[str]:
         """
 
         def get_if_valid(s: str) -> Optional[str]:
-            if not s or len(s) < 2:
+            if s is None or len(s) < 2:
                 return None
             s = s[:2]
             if re.search(RE_LANG, s):
diff --git a/newspaper/utils/__init__.py b/newspaper/utils/__init__.py
index 9cbaa27..405683c 100644
--- a/newspaper/utils/__init__.py
+++ b/newspaper/utils/__init__.py
@@ -18,7 +18,6 @@
 import sys
 import threading
 import time
-
 from hashlib import sha1
 
 from bs4 import BeautifulSoup
@@ -317,6 +316,32 @@ def extend_config(config, config_items):
 
     return config
 
+def progressbar(it, prefix="", size=60, out=sys.stdout):
+    """Display a simple progress bar without
+    heavy dependencies like tqdm"""
+    count = len(it)
+    start = time.time()
+
+    def show(j):
+        x = int(size * j / count)
+        remaining = ((time.time() - start) / j) * (count - j)
+
+        mins, sec = divmod(remaining, 60)
+        time_str = f"{int(mins):02}:{sec:05.2f}"
+
+        print(
+            f"{prefix}[{u'█'*x}{('.'*(size-x))}] {j}/{count} Est wait {time_str}",
+            end="\r",
+            file=out,
+            flush=True,
+        )
+
+    for i, item in enumerate(it):
+        yield item
+        show(i + 1)
+    print("\n", flush=True, file=out)
+
+
 def print_node_tree(node, header="", last=True, with_gravity=True):
     """Prints out the html node tree for nodes with gravity scores
     debugging method
diff --git a/tests/evaluation/evaluate.py b/tests/evaluation/evaluate.py
new file mode 100644
index 0000000..837c8f9
--- /dev/null
+++ b/tests/evaluation/evaluate.py
@@ -0,0 +1,103 @@
+from urllib.parse import urljoin
+import requests
+import argparse
+import json
+import gzip
+from pathlib import Path
+import newspaper
+from nltk.translate import bleu_score
+
+from newspaper.utils import progressbar
+
+
+def read_or_download_json(url_or_path):
+    """Reads a json file from a url or a local path"""
+    if url_or_path.startswith("http"):
+        return requests.get(url_or_path, timeout=(5, 10)).json()
+    else:
+        with open(url_or_path, "r", encoding="utf-8") as f:
+            return json.load(f)
+
+
+def get_html(url_or_path):
+    """Gets the html from a url or a local path"""
+    if url_or_path.startswith("http"):
+        content = requests.get(url_or_path, timeout=(5, 10)).content
+    else:
+        # binary mode cannot take an encoding argument; we decode below
+        with open(url_or_path, "rb") as f:
+            content = f.read()
+
+    if not url_or_path.lower().endswith(".gz"):
+        if isinstance(content, bytes):
+            content = content.decode("utf-8")
+        return content
+
+    content = gzip.decompress(content)
+    return content.decode("utf-8")
+
+
+def main(args):
+    ground_truth = read_or_download_json(args.ground_truth)
+    results = {}
+    for filename, expected_article in progressbar(ground_truth.items()):
+        if not filename.endswith(".html") and not filename.endswith(".html.gz"):
+            filename += ".html.gz"
+
+        if args.html_folder.startswith("http"):
+            html = get_html(urljoin(args.html_folder, filename))
+        else:
+            # get_html expects a str (it calls .startswith/.endswith on it)
+            html = get_html(str(Path(args.html_folder) / filename))
+
+        article = newspaper.article(url=expected_article["url"], input_html=html)
+
+        parsed_result = article.text
+
+        # NLTK BLEU scores operate on token lists, not raw strings
+        results[filename] = {
+            "url": expected_article["url"],
+            "truth": expected_article["articleBody"],
+            "parsed": parsed_result,
+            "bleu_score": bleu_score.sentence_bleu(
+                [expected_article["articleBody"].split()], parsed_result.split()
+            ),
+            "precision": bleu_score.modified_precision(
+                [expected_article["articleBody"].split()], parsed_result.split(), n=5
+            ),
+        }
+    corpus_score = bleu_score.corpus_bleu(
+        [[result["truth"].split()] for result in results.values()],
+        [result["parsed"].split() for result in results.values()],
+    )
+    sorted_results = sorted(
+        [(k, result["url"], result["bleu_score"]) for k, result in results.items()],
+        key=lambda x: x[2],
+        reverse=False,
+    )
+
+    print(f"Corpus BLEU score: {corpus_score}")
+
+    print("Top 10 worst results:")
+    for filename, url, score in sorted_results[:10]:
+        print(f"{score} {filename} {url}")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--ground-truth",
+        type=str,
+        default="https://raw.githubusercontent.com/scrapinghub/article-extraction-benchmark/master/ground-truth.json",
+        help="URL to the groundtruth json or a local path to the json file",
+    )
+    parser.add_argument(
+        "--html-folder",
+        type=str,
+        default="https://github.com/scrapinghub/article-extraction-benchmark/raw/master/html/",
+        help=(
+            "URL to the folder containing the html files or a local path to the folder"
+        ),
+    )
+    args = parser.parse_args()
+    main(args)