Fix 3.12 build #619

Merged · 6 commits · Mar 9, 2024

4 changes: 2 additions & 2 deletions .github/workflows/pipeline.yml
@@ -30,10 +30,10 @@ jobs: # jobs. We will have two jobs (test and publish) with multiple steps.
          python -m pip install --upgrade pip
          poetry config virtualenvs.create false --local
          poetry install --all-extras
-         pip install pytest pylint coverage mypy coveralls
+         pip install pylint coveralls
          # python -m nltk.downloader punkt stopwords
        env:
-         SETUPTOOLS_USE_DISTUTILS: stdlib
+         SETUPTOOLS_USE_DISTUTILS: local
      - name: Pylint # Run pylint static analysis
        run: |
          poetry run pylint newspaper --fail-under=8.0
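
A note on the env change, which is the actual 3.12 fix in this file: Python 3.12 removed the stdlib distutils module (PEP 632), so SETUPTOOLS_USE_DISTUTILS=stdlib no longer has anything to point at, while "local" selects the copy vendored inside setuptools. A hedged sketch of the difference, not part of the PR and assuming setuptools is installed:

    # Sketch only: why "stdlib" stopped working on Python 3.12.
    # The stdlib distutils is gone (PEP 632); with setuptools installed,
    # the import resolves through setuptools' vendored copy instead.
    import sys
    import distutils  # on 3.12 this only resolves via setuptools' shim

    print(sys.version_info[:2], distutils.__file__)
    # on 3.12+: .../site-packages/setuptools/_distutils/__init__.py
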
3 changes: 2 additions & 1 deletion newspaper/extractors/articlebody_extractor.py
@@ -4,6 +4,7 @@
 from statistics import mean
 from typing import Optional
 import lxml
+from newspaper.configuration import Configuration
 import newspaper.extractors.defines as defines
 import newspaper.parsers as parsers
 from newspaper.text import StopWords
@@ -25,7 +26,7 @@


 class ArticleBodyExtractor:
-    def __init__(self, config):
+    def __init__(self, config: Configuration):
         self.config = config
         self.top_node = None
         self.top_node_complemented = None
21 changes: 10 additions & 11 deletions newspaper/nlp.py
@@ -31,32 +31,31 @@ def keywords(text: str, stopwords: StopWords, max_keywords: Optional[int] = None
     Returns:
         dict: The top 10 keywords and their frequency scores.
     """
-    text = list(stopwords.tokenizer(text))
+    tokenised_text = list(stopwords.tokenizer(text))
     if not text:
         return dict()
     # of words before removing blacklist words
-    num_words = len(text) or 1
-    text = filter(lambda x: x not in stopwords.stop_words, text)
+    num_words = len(tokenised_text) or 1
+    tokenised_text = list(
+        filter(lambda x: x not in stopwords.stop_words, tokenised_text)
+    )

-    freq = Counter(text)
+    freq = Counter(tokenised_text)

     keywords_ = freq.most_common(max_keywords)
+    keywords_dict = {k: v * 1.5 / num_words + 1 for k, v in keywords_}

-    keywords_ = {k: v * 1.5 / num_words + 1 for k, v in keywords_}
-
-    return keywords_
+    return keywords_dict


-def summarize(
-    title: str, text: str, stopwords: StopWords, max_sents: Optional[int] = 5
-):
+def summarize(title: str, text: str, stopwords: StopWords, max_sents: int = 5):
     """Summarize an article into the most relevant sentences in the article.

     Args:
         title (str): the article title
         text (str): article contents
         stopwords (StopWords): stopwords object for the language of the text
-        max_sents (Optional[int], optional):maximum number of sentences to
+        max_sents (int, optional):maximum number of sentences to
             return in the summary. Sentences are weighted by their relevance
             using the following criteria: sentence position, frequency of
             keywords, title words found in the sentence, and sentence length.
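
A note on what this rewrite does: the scoring formula is unchanged. The tokenised list simply gets its own name so each variable keeps a single type (the str-to-list-to-dict rebinding is the sort of thing mypy on the 3.12 CI objects to), and the filter result is materialised as a list. A toy re-derivation of the scoring, with illustrative names that are not the library's API:

    # Toy sketch of the keyword scoring above; names are illustrative.
    from collections import Counter

    def toy_keywords(tokens, stop_words, max_keywords=10):
        num_words = len(tokens) or 1  # counted before stop-word removal
        kept = [t for t in tokens if t not in stop_words]
        # same weighting as keywords(): 1.5 * frequency / total words, plus 1
        return {k: v * 1.5 / num_words + 1
                for k, v in Counter(kept).most_common(max_keywords)}

    print(toy_keywords("the cat sat on the mat".split(), {"the", "on"}))
    # -> {'cat': 1.25, 'sat': 1.25, 'mat': 1.25}

Because num_words is taken before stop-word removal, stopword-heavy texts score their surviving keywords lower, which is the intended normalisation.
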
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -239,7 +239,7 @@ def create_element(tag, text=None, tail=None):

 def remove(
     nodes: Union[lxml.html.HtmlElement, List[lxml.html.HtmlElement]],
-    keep_tags: List[str] = None,
+    keep_tags: Optional[List[str]] = None,
 ):
     """Remove the node(s) from the tree
     Arguments:
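
This is the PEP 484 implicit-Optional fix: a None default no longer implies Optional for type checkers, and recent mypy releases (the ones that run on a 3.12 toolchain) reject the old spelling by default. A minimal sketch; runtime behaviour is identical in both forms:

    from typing import List, Optional

    def old_style(keep_tags: List[str] = None): ...            # mypy: error (implicit Optional)
    def new_style(keep_tags: Optional[List[str]] = None): ...  # accepted
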
8 changes: 4 additions & 4 deletions newspaper/text.py
@@ -15,14 +15,14 @@

 from newspaper import settings

-punctuation = {
+punctuation_set = {
     c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P")
 }
-punctuation.update(string.punctuation)
+punctuation_set.update(string.punctuation)
 # remove characters used in contractions
 contraction_separators = set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ")
-punctuation -= contraction_separators
-punctuation: str = "".join(list(punctuation))
+punctuation_set -= contraction_separators
+punctuation: str = "".join(list(punctuation_set))
 whitespace_tokenizer = WhitespaceTokenizer()
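
The rename fixes the same class of problem as the parsers.py change: the module previously rebound one name, `punctuation`, from a set to a str, which type checkers flag as an incompatible redefinition. Now `punctuation_set` stays a set and `punctuation` stays a str. A hedged usage sketch of the resulting string, assuming the module-level `punctuation` built above and consuming it via str.translate:

    # Usage sketch only; `punctuation` is the module-level string above.
    from newspaper.text import punctuation

    table = str.maketrans("", "", punctuation)
    print("Hello, world! It's fine.".translate(table))
    # -> "Hello world It's fine"  (the apostrophe survives because
    #    contraction separators were subtracted from the set)
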
353 changes: 237 additions & 116 deletions poetry.lock

Large diffs are not rendered by default.

13 changes: 9 additions & 4 deletions pyproject.toml
@@ -37,7 +37,14 @@
 requests = ">=2.26.0"
 feedparser = ">=6.0.0"
 tldextract = ">=2.0.1"
 python-dateutil = ">=2.6.1"
-setuptools = "<60" # As per numpy recommendations at https://numpy.org/doc/stable/reference/distutils_status_migration.html#numpy-setuptools-interaction
+numpy = [
+    { version = ">=1.26", python = ">=3.9", optional = true},
+    { version = "^1.24", python = ">=3.8, <3.9", optional = true}
+]
+pandas = [
+    {version = ">=2.1.0", optional = true, python = ">=3.9"},
+    {version = ">=2", optional = true, python = ">=3.8, <3.9"}
+]

 # Language specific dependencies
 tinysegmenter = {version = ">=0.4", optional = true}
@@ -54,9 +61,6 @@ hi = ["indic-nlp-library"]
 np = ["indic-nlp-library"]
 ta = ["indic-nlp-library"]
-
-
-

 [tool.poetry.group.dev.dependencies]
 coverage = {version = ">=7.3.2", python = "^3.8"}
 pre-commit = {version = ">=3.5.0", python = "^3.8"}
@@ -69,6 +73,7 @@ types-pillow = {version = "^10.2.0.20240213", python = "^3.8"}
 types-python-dateutil = {version = "^2.8.19.20240106", python = "^3.8"}
 types-requests = "^2.27.1"
 types-beautifulsoup4 = {version = "^4.12.0.20240106", python = "^3.8"}
+virtualenv = {version = ">=20.25.1"}

 [tool.poetry.group.docs.dependencies]
 sphinx = {version = ">=7.0.0", python = "^3.8"}
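
A note on this change: the setuptools = "<60" pin existed for numpy's old numpy.distutils interaction, which is moot on Python 3.12 — distutils is gone, and numpy >= 1.26 (the first series with CPython 3.12 wheels) builds with Meson instead. The new multiple-constraints entries let Poetry pick whichever version's `python` marker matches the target interpreter. An illustrative sketch of that selection; Poetry does this during resolution, and the function below is not a real API:

    # Illustrative only: mirrors how the numpy entry above resolves.
    import sys

    def numpy_constraint(py=sys.version_info[:2]):
        return ">=1.26" if py >= (3, 9) else "^1.24"

    print(numpy_constraint((3, 12)))  # '>=1.26' — first series with 3.12 wheels
    print(numpy_constraint((3, 8)))   # '^1.24'
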
1 change: 1 addition & 0 deletions stubs/indicnlp/tokenize/indic_tokenize.pyi
@@ -0,0 +1 @@
+def trivial_tokenize(text: str, lang: str = ...): ...
3 changes: 3 additions & 0 deletions stubs/jieba.pyi
@@ -0,0 +1,3 @@
+from typing import Union
+
+def cut(sentence: Union[bytes, str], cut_all: bool = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/__init__.pyi
@@ -2,3 +2,4 @@ from typing import Optional
 from . import data  # noqa: F401

 def download(info_or_id: Optional[str] = None): ...
+def word_tokenize(text, language: str = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/tokenize.pyi
@@ -0,0 +1 @@
+class WhitespaceTokenizer: ...
5 changes: 5 additions & 0 deletions stubs/pythainlp.pyi
@@ -0,0 +1,5 @@
+from typing import List
+
+def word_tokenize(
+    text: str,
+) -> List[str]: ...
2 changes: 2 additions & 0 deletions stubs/tinysegmenter.pyi
@@ -0,0 +1,2 @@
+class TinySegmenter:
+    def tokenize(self, text: str): ...
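
A closing note on the new stubs directory: these .pyi files are minimal type stubs for tokenizer packages that ship without type information (indic-nlp-library, jieba, nltk, pythainlp, tinysegmenter). The `...` bodies are standard stub convention, and only the functions newspaper actually calls are declared. Presumably mypy picks them up through its stub search path (e.g. mypy_path = "stubs"), though that wiring is not visible in this diff.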