feat(lang): ⚡ Rework of tokenizer. Additionally implemented a new (easier) way of adding languages to the package
AndyTheFactory committed Jan 22, 2024
1 parent 1071667 commit 0833859
Showing 36 changed files with 1,154 additions and 585 deletions.
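
For orientation: this commit replaces the per-language StopWords subclasses with small plug-in modules under newspaper/language/, each exposing a tokenizer callable (and, where needed, extra helpers). A minimal sketch of what such a plug-in might look like — the module name and sample logic here are illustrative, not part of the diff:

```python
# Hypothetical plug-in, e.g. newspaper/language/xx.py (name for illustration only).
# The only contract visible in this commit is a module-level `tokenizer` callable
# that turns raw text into a list (or iterable) of tokens.


def tokenizer(text: str):
    # Real modules delegate to a language-specific library (nltk, jieba, ...);
    # a plain whitespace split stands in for that here.
    return text.split()
```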
Binary file added docs/user_guide/assets/logo_v1_150.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added docs/user_guide/assets/logo_v1_670.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
14 changes: 7 additions & 7 deletions newspaper/article.py
@@ -17,6 +17,7 @@
import requests

from newspaper.exceptions import ArticleException
from newspaper.text import StopWords

from . import network
from . import nlp
@@ -489,9 +490,6 @@ def parse(self) -> "Article":
if metadata["language"] in get_available_languages():
self.meta_lang = metadata["language"]

if self.config.use_meta_language:
self.extractor.update_language(self.meta_lang)

self.meta_site_name = metadata["site_name"]
self.meta_description = metadata["description"]
self.canonical_link = metadata["canonical_link"]
@@ -616,9 +614,11 @@ def nlp(self):
self.throw_if_not_downloaded_verbose()
self.throw_if_not_parsed_verbose()

nlp.load_stopwords(self.config.language)
keywords = nlp.keywords(self.text, self.config.max_keywords)
for k, v in nlp.keywords(self.title, self.config.max_keywords).items():
stopwords = StopWords(self.config.language)
keywords = nlp.keywords(self.text, stopwords, self.config.max_keywords)
for k, v in nlp.keywords(
self.title, stopwords, self.config.max_keywords
).items():
if k in keywords:
keywords[k] += v
keywords[k] /= 2
@@ -634,7 +634,7 @@ def nlp(self):
max_sents = self.config.max_summary_sent

summary_sents = nlp.summarize(
title=self.title, text=self.text, max_sents=max_sents
title=self.title, text=self.text, stopwords=stopwords, max_sents=max_sents
)
self.summary = "\n".join(summary_sents)

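To make the new keyword path above concrete, here is a rough usage sketch. The StopWords constructor and the positional nlp.keywords(text, stopwords, max_keywords) call are taken from the changed lines; the sample strings and the standalone merge loop are illustrative only:

```python
from newspaper import nlp
from newspaper.text import StopWords

article_title = "Example: a short headline about tokenizers"
article_text = "Example body text. It mentions tokenizers and stopwords repeatedly."

stopwords = StopWords("en")  # one instance, reused for title and body

keywords = nlp.keywords(article_text, stopwords, 10)
for k, v in nlp.keywords(article_title, stopwords, 10).items():
    # Terms found in both title and body get their scores averaged,
    # mirroring the merge in Article.nlp() above.
    if k in keywords:
        keywords[k] += v
        keywords[k] /= 2
```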
2 changes: 1 addition & 1 deletion newspaper/cli.py
@@ -46,7 +46,7 @@ def get_arparse() -> argparse.ArgumentParser:
parser.add_argument(
"--output-format",
"-of",
choices=["csv", "json", "text"],
choices=["csv", "json", "text"], # TODO: add stdout as an option
default="json",
help="The output format of the parsed article.",
)
40 changes: 0 additions & 40 deletions newspaper/configuration.py
@@ -15,15 +15,6 @@

from newspaper.utils import get_available_languages

from .text import (
StopWords,
StopWordsArabic,
StopWordsChinese,
StopWordsKorean,
StopWordsHindi,
StopWordsJapanese,
StopWordsThai,
)
from .version import __version__

log = logging.getLogger(__name__)
@@ -76,8 +67,6 @@ class Configuration:
Default True.
http_success_only (bool): if True, it will raise an :any:`ArticleException`
if the html status_code is >= 400 (e.g. 404 page). default True.
stopwords_class (obj): unique stopword classes for oriental languages,
don't toggle
requests_params (dict): Any of the params for the
`get call`_ from ``requests`` library
number_threads (int): number of threads to use for multi-threaded downloads
@@ -181,9 +170,6 @@ def __init__(self):
# English is the fallback
self._language = "en"

# Unique stopword classes for oriental languages, don't toggle
self.stopwords_class = StopWords

# Params for get call from `requests` lib
self.requests_params = {
"timeout": 7,
@@ -289,7 +275,6 @@ def language(self, value: str):

# Set oriental language stopword class
self._language = value
self.stopwords_class = self.get_stopwords_class(value)

@property
def use_meta_language(self):
@@ -302,31 +287,6 @@ def use_meta_language(self):
"""
return self._use_meta_language

@staticmethod
def get_stopwords_class(language: str):
"""Get the stopwords class for the given language.
Arguments:
language (str): The language for which it will return the StopWords object.
Returns:
class(StopWords): The stopwords class for the given language.
"""
if language == "ko":
return StopWordsKorean
elif language == "hi":
return StopWordsHindi
elif language == "zh":
return StopWordsChinese
# Persian and Arabic Share an alphabet
# There is a persian parser https://github.com/sobhe/hazm,
# but nltk is likely sufficient
elif language == "ar" or language == "fa":
return StopWordsArabic
elif language == "ja":
return StopWordsJapanese
elif language == "th":
return StopWordsThai
return StopWords

@property
def MIN_WORD_COUNT(self):
warn(
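With get_stopwords_class gone from Configuration, the language dispatch presumably lives inside StopWords itself, which can resolve a matching module under newspaper/language/ at construction time. A rough sketch of that idea — the importlib mechanism, attribute names, and fallback below are assumptions, not code from this diff:

```python
import importlib


class StopWordsSketch:
    """Illustrative stand-in for the reworked newspaper.text.StopWords."""

    def __init__(self, language: str = "en"):
        self.language = language
        try:
            # e.g. newspaper.language.ja, newspaper.language.zh, newspaper.language.th ...
            lang_module = importlib.import_module(f"newspaper.language.{language}")
            self.tokenizer = lang_module.tokenizer
        except ImportError:
            # Languages without a dedicated module fall back to a simple split.
            self.tokenizer = lambda text: text.split()
```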
21 changes: 7 additions & 14 deletions newspaper/extractors/articlebody_extractor.py
@@ -4,6 +4,7 @@
import lxml
import newspaper.extractors.defines as defines
import newspaper.parsers as parsers
from newspaper.text import StopWords

score_weights = {
"start_boosting_score": 1.0,
@@ -22,15 +23,14 @@ def __init__(self, config):
self.config = config
self.top_node = None
self.top_node_complemented = None
self.stopwords_class = config.stopwords_class
self.language = config.language

def parse(self, doc: lxml.html.Element):
"""_summary_
Args:
doc (lxml.html.Element): _description_
"""
self.stopwords = StopWords(self.config.language)
self.top_node = self.calculate_best_node(doc)
self.top_node_complemented = self.complement_with_siblings(self.top_node)

Expand All @@ -47,9 +47,8 @@ def calculate_best_node(self, doc):
text_node = parsers.get_text(node)
if not text_node:
continue
word_stats = self.stopwords_class(
language=self.language
).get_stopword_count(text_node)

word_stats = self.stopwords.get_stopword_count(text_node)
high_link_density = parsers.is_highlink_density(node)
if word_stats.stop_word_count > 2 and not high_link_density:
nodes_with_text.append(node)
@@ -80,9 +79,7 @@ def calculate_best_node(self, doc):
boost_score = score_weights["negative_score_boost"]

text_node = parsers.get_text(node)
word_stats = self.stopwords_class(
language=self.language
).get_stopword_count(text_node)
word_stats = self.stopwords.get_stopword_count(text_node)
upscore = int(word_stats.stop_word_count + boost_score)

parent_node = node.getparent()
@@ -170,9 +167,7 @@ def is_boostable(self, node):
if steps_away >= max_stepsaway_from_node:
return False
paragraph_text = parsers.get_text(current_node)
word_stats = self.stopwords_class(
language=self.language
).get_stopword_count(paragraph_text)
word_stats = self.stopwords.get_stopword_count(paragraph_text)
if word_stats.stop_word_count > minimum_stopword_count:
return True
steps_away += 1
@@ -288,9 +283,7 @@ def get_plausible_content(self, node, baseline_score, score_weight=0.3):
if parsers.is_highlink_density(paragraph):
continue

word_stats = self.stopwords_class(
language=self.language
).get_stopword_count(text)
word_stats = self.stopwords.get_stopword_count(text)

if word_stats.stop_word_count > baseline_score * score_weight:
element = parsers.create_element(tag="p", text=text)
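The body extractor now creates one StopWords instance in parse() and reuses it for every scoring call, instead of instantiating a stopwords class per node. A small sketch of the scoring pattern — get_stopword_count() returning an object with a stop_word_count attribute comes from the lines above, while the sample text and print are illustrative:

```python
from newspaper.text import StopWords

stopwords = StopWords("en")
word_stats = stopwords.get_stopword_count("This is a short example sentence about nothing much.")

# In calculate_best_node above, a node is kept as a text candidate when its
# text contains more than two stopwords and the node is not link-dense.
if word_stats.stop_word_count > 2:
    print("node would be kept as a candidate")
```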
12 changes: 0 additions & 12 deletions newspaper/extractors/content_extractor.py
@@ -22,7 +22,6 @@ class ContentExtractor:
def __init__(self, config: Configuration):
self.config = config
self.language = config.language
self.stopwords_class = config.stopwords_class
self.title_extractor = TitleExtractor(config)
self.author_extractor = AuthorsExtractor(config)
self.pubdate_extractor = PubdateExtractor(config)
@@ -32,17 +31,6 @@ def __init__(self, config: Configuration):
self.image_extractor = ImageExtractor(config)
self.video_extractor = VideoExtractor(config)

def update_language(self, meta_lang: str):
"""Required to be called before the extraction process in some
cases because the stopwords_class has to set in case the lang
is not latin based
"""
if meta_lang:
self.language = meta_lang
self.stopwords_class = self.config.get_stopwords_class(meta_lang)
self.atricle_body_extractor.stopwords_class = self.stopwords_class
self.atricle_body_extractor.language = self.language

def get_authors(self, doc: lxml.html.Element) -> List[str]:
"""Fetch the authors of the article, return as a list
Only works for english articles
Empty file added newspaper/language/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions newspaper/language/ar.py
@@ -0,0 +1,7 @@
def tokenizer(text):
    import nltk

    s = nltk.stem.isri.ISRIStemmer()
    words = nltk.tokenize.wordpunct_tokenize(text)
    words = [s.stem(word) for word in words]
    return words
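
A quick usage sketch for the Arabic module (the sample sentence is illustrative; wordpunct_tokenize and the ISRI stemmer work without extra nltk data downloads, though nltk itself must be installed):

```python
from newspaper.language.ar import tokenizer

# Tokens are reduced to their ISRI roots, so stopword matching works on stems.
print(tokenizer("الكتاب الجديد على الطاولة"))
```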
12 changes: 12 additions & 0 deletions newspaper/language/ja.py
@@ -0,0 +1,12 @@
try:
    import tinysegmenter
except ImportError as e:
    raise ImportError(
        "You must install tinysegmenter before using the Japanese tokenizer. \n"
        "Try pip install tinysegmenter\n"
        "or pip install newspaper3k[ja]\n"
        "or pip install newspaper3k[all]\n"
    ) from e

segmenter = tinysegmenter.TinySegmenter()
tokenizer = segmenter.tokenize
14 changes: 14 additions & 0 deletions newspaper/language/ko.py
@@ -0,0 +1,14 @@
from nltk import word_tokenize

tokenizer = word_tokenize


def find_stopwords(tokens, stopwords):
    res = []
    for w in tokens:
        for s in stopwords:
            if w.endswith(s):
                res.append(w)
                break

    return res
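
Usage sketch for the Korean helpers. find_stopwords matches by suffix because Korean particles attach directly to the preceding word, so exact token equality would miss them. The sample sentence and stopword list are illustrative, and nltk's word_tokenize may need its punkt tokenizer data downloaded:

```python
from newspaper.language.ko import find_stopwords, tokenizer

tokens = tokenizer("저는 학교에 갑니다")  # may require nltk 'punkt' data
particles = ["는", "에"]                   # illustrative stopword suffixes
print(find_stopwords(tokens, particles))   # words ending in a listed particle
```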
11 changes: 11 additions & 0 deletions newspaper/language/th.py
@@ -0,0 +1,11 @@
try:
    import pythainlp
except ImportError as e:
    raise ImportError(
        "You must install pythainlp before using the Thai tokenizer. \n"
        "Try pip install pythainlp\n"
        "or pip install newspaper3k[th]\n"
        "or pip install newspaper3k[all]\n"
    ) from e

tokenizer = pythainlp.word_tokenize
11 changes: 11 additions & 0 deletions newspaper/language/zh.py
@@ -0,0 +1,11 @@
try:
    import jieba
except ImportError as e:
    raise ImportError(
        "You must install jieba before using the Chinese tokenizer. \n"
        "Try pip install jieba\n"
        "or pip install newspaper3k[zh]\n"
        "or pip install newspaper3k[all]\n"
    ) from e

tokenizer = lambda x: jieba.cut(x, cut_all=True)  # noqa: E731
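
The same import-guard pattern backs Japanese (tinysegmenter), Thai (pythainlp), and Chinese (jieba). A usage sketch for the Chinese module — jieba.cut returns a generator, so it is wrapped in list() here; the sample string is illustrative and jieba must be installed:

```python
from newspaper.language.zh import tokenizer

# cut_all=True produces full-mode segmentation (all plausible word splits).
print(list(tokenizer("今天天气很好")))
```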