Skip to content

Commit

Permalink
refactor: added valid_languages function that returns available langu…
Browse files Browse the repository at this point in the history
…ages
  • Loading branch information
AndyTheFactory committed Jan 23, 2024
1 parent 9a48739 commit 3284732
Show file tree
Hide file tree
Showing 4 changed files with 25 additions and 9 deletions.
3 changes: 3 additions & 0 deletions newspaper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@
import logging
from logging import NullHandler
from .exceptions import ArticleBinaryDataException, ArticleException
from .languages import valid_languages


# Set default logging handler to avoid "No handler found" warnings.
logging.getLogger(__name__).addHandler(NullHandler())
Expand Down Expand Up @@ -69,6 +71,7 @@ def article(url: str, language: Optional[str] = "en", **kwargs) -> Article:
"fulltext",
"hot",
"languages",
"valid_languages",
"popular_urls",
"Config",
"Article",
Expand Down
14 changes: 14 additions & 0 deletions newspaper/languages/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from pathlib import Path
from typing import Optional

from newspaper import settings


languages_tuples = [
("aa", "Afar"),
Expand Down Expand Up @@ -223,3 +226,14 @@ def get_language_from_iso639_1(iso639_1: str) -> Optional[str]:
str: Language name (in english)
"""
return languages_dict.get(iso639_1)


def valid_languages():
    """Return the languages for which a stopwords file is available.

    A language counts as available when a file named
    ``stopwords-<code>.txt`` exists in ``settings.STOPWORDS_DIR``, where
    ``<code>`` is the first element of an entry in ``languages_tuples``.

    Returns:
        list: ``(code, language)`` tuples, in the same order as
        ``languages_tuples``, restricted to entries whose stopwords file
        exists. Empty when no stopwords file is found.
    """
    stopwords_dir = Path(settings.STOPWORDS_DIR)
    # Stopwords files are keyed by the language *code* (e.g.
    # "stopwords-en.txt"), not by the english language name, so the lookup
    # has to be built from ``code``.
    return [
        (code, language)
        for code, language in languages_tuples
        if (stopwords_dir / f"stopwords-{code}.txt").exists()
    ]
6 changes: 3 additions & 3 deletions newspaper/text.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,13 @@ def __init__(self, language="en"):
self.tokenizer = default_tokenizer

if language not in self._cached_stop_words:
stopwordsFile = Path(settings.STOPWORDS_DIR) / f"stopwords-{language}.txt"
if not stopwordsFile.exists():
stopwords_file = Path(settings.STOPWORDS_DIR) / f"stopwords-{language}.txt"
if not stopwords_file.exists():
raise FileNotFoundError(
f"Stopwords file for language {language} not found! Make sure that "
"the language is supported (see `newspaper.languages()`)"
)
with open(stopwordsFile, "r", encoding="utf-8") as f:
with open(stopwords_file, "r", encoding="utf-8") as f:
self._cached_stop_words[language] = set(f.read().splitlines())

lang_module = Path(__file__).parent / "languages" / f"{language}.py"
Expand Down
11 changes: 5 additions & 6 deletions tests/test_languages.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pytest

import newspaper
from newspaper import nlp
from newspaper.article import Article
from newspaper.text import StopWords
from tests import conftest
Expand Down Expand Up @@ -104,12 +103,12 @@ def test_error_unknown_language(self):
with pytest.raises(ValueError):
_ = Article("http://www.cnn.com", language="zz")

@pytest.mark.skip(reason="valid_languages not implemented")
def test_stopwords_english(self, valid_language_fixture):
for lang in valid_language_fixture:
nlp.stopwords = set()
nlp.load_stopwords(lang)
assert len(nlp.stopwords) > 100
for lang, language_name in valid_language_fixture:
stopwords = StopWords(lang)
assert (
len(stopwords.stop_words) > 100
), f"Language {language_name} has too few stopwords"

def test_full_extract(self, language_article_fixture):
errors = []
Expand Down

0 comments on commit 3284732

Please sign in to comment.