Fix the mypy issues
palfrey committed Mar 2, 2024
1 parent 47504a9 commit f700f9b
Showing 10 changed files with 30 additions and 17 deletions.
3 changes: 2 additions & 1 deletion newspaper/extractors/articlebody_extractor.py
@@ -4,6 +4,7 @@
 from statistics import mean
 from typing import Optional
 import lxml
+from newspaper.configuration import Configuration
 import newspaper.extractors.defines as defines
 import newspaper.parsers as parsers
 from newspaper.text import StopWords
@@ -25,7 +26,7 @@
 
 
 class ArticleBodyExtractor:
-    def __init__(self, config):
+    def __init__(self, config: Configuration):
         self.config = config
         self.top_node = None
         self.top_node_complemented = None
21 changes: 10 additions & 11 deletions newspaper/nlp.py
@@ -31,32 +31,31 @@ def keywords(text: str, stopwords: StopWords, max_keywords: Optional[int] = None
     Returns:
         dict: The top 10 keywords and their frequency scores.
     """
-    text = list(stopwords.tokenizer(text))
+    tokenised_text = list(stopwords.tokenizer(text))
     if not text:
         return dict()
     # of words before removing blacklist words
-    num_words = len(text) or 1
-    text = filter(lambda x: x not in stopwords.stop_words, text)
+    num_words = len(tokenised_text) or 1
+    tokenised_text = list(
+        filter(lambda x: x not in stopwords.stop_words, tokenised_text)
+    )
 
-    freq = Counter(text)
+    freq = Counter(tokenised_text)
 
     keywords_ = freq.most_common(max_keywords)
-
-    keywords_ = {k: v * 1.5 / num_words + 1 for k, v in keywords_}
+    keywords_dict = {k: v * 1.5 / num_words + 1 for k, v in keywords_}
 
-    return keywords_
+    return keywords_dict
 
 
-def summarize(
-    title: str, text: str, stopwords: StopWords, max_sents: Optional[int] = 5
-):
+def summarize(title: str, text: str, stopwords: StopWords, max_sents: int = 5):
     """Summarize an article into the most relevant sentences in the article.
     Args:
         title (str): the article title
         text (str): article contents
         stopwords (StopWords): stopwords object for the language of the text
-        max_sents (Optional[int], optional):maximum number of sentences to
+        max_sents (int, optional):maximum number of sentences to
             return in the summary. Sentences are weighted by their relevance
             using the following criteria: sentence position, frequency of
             keywords, title words found in the sentence, and sentence length.
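The text -> tokenised_text rename is what satisfies mypy here: the parameter is annotated as str, and re-binding the same name to a list (and, before this change, to a filter object) leaves the variable without a single consistent type. A minimal sketch of the pattern, not taken from the repository (names are illustrative):

def keywords_bad(text: str) -> None:
    # error: Incompatible types in assignment (expression has type "list[str]",
    # variable has type "str")
    text = text.split()


def keywords_ok(text: str) -> None:
    # Giving the tokenised value its own name keeps each variable at one type,
    # which is what the commit does with tokenised_text.
    tokenised_text = text.split()
    print(len(tokenised_text))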
2 changes: 1 addition & 1 deletion newspaper/parsers.py
@@ -239,7 +239,7 @@ def create_element(tag, text=None, tail=None):
 
 def remove(
     nodes: Union[lxml.html.HtmlElement, List[lxml.html.HtmlElement]],
-    keep_tags: List[str] = None,
+    keep_tags: Optional[List[str]] = None,
 ):
     """Remove the node(s) from the tree
     Arguments:
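The keep_tags annotation is the standard fix for mypy's implicit-Optional check: current mypy (the default changed in 0.990) rejects a None default on a parameter whose annotation does not allow None. A minimal sketch, not taken from the repository:

from typing import List, Optional


def remove_bad(keep_tags: List[str] = None):
    # error: Incompatible default for argument "keep_tags"
    # (default has type "None", argument has type "List[str]")
    ...


def remove_ok(keep_tags: Optional[List[str]] = None):
    # Explicit Optional, as in this commit; callers may still omit the argument.
    ...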
8 changes: 4 additions & 4 deletions newspaper/text.py
@@ -15,14 +15,14 @@
 
 from newspaper import settings
 
-punctuation = {
+punctuation_set = {
     c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P")
 }
-punctuation.update(string.punctuation)
+punctuation_set.update(string.punctuation)
 # remove characters used in contractions
 contraction_separators = set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ")
-punctuation -= contraction_separators
-punctuation: str = "".join(list(punctuation))
+punctuation_set -= contraction_separators
+punctuation: str = "".join(list(punctuation_set))
 whitespace_tokenizer = WhitespaceTokenizer()
 
 
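The punctuation_set rename follows the same one-type-per-name idea: the module built a set under the name punctuation and then re-bound that name to a str, which mypy rejects. As a standalone illustration (not part of the commit) of what the resulting table contains and why the contraction separators are carved out:

import string
import sys
from unicodedata import category

# All code points whose Unicode category starts with "P" (punctuation), plus ASCII
# punctuation, minus the characters that appear inside contractions and hyphenated words.
punctuation_set = {
    c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P")
}
punctuation_set.update(string.punctuation)
punctuation_set -= set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ")
punctuation = "".join(punctuation_set)

print("!" in punctuation)   # True  -- treated as punctuation
print("'" in punctuation)   # False -- kept, so "don't" is not torn apart
print("-" in punctuation)   # False -- kept for hyphenated words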
1 change: 1 addition & 0 deletions stubs/indicnlp/tokenize/indic_tokenize.pyi
@@ -0,0 +1 @@
+def trivial_tokenize(text: str, lang: str = ...): ...
3 changes: 3 additions & 0 deletions stubs/jieba.pyi
@@ -0,0 +1,3 @@
+from typing import Union
+
+def cut(sentence: Union[bytes, str], cut_all: bool = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/__init__.pyi
@@ -2,3 +2,4 @@ from typing import Optional
 from . import data  # noqa: F401
 
 def download(info_or_id: Optional[str] = None): ...
+def word_tokenize(text, language: str = ...): ...
1 change: 1 addition & 0 deletions stubs/nltk/tokenize.pyi
@@ -0,0 +1 @@
+class WhitespaceTokenizer: ...
5 changes: 5 additions & 0 deletions stubs/pythainlp.pyi
@@ -0,0 +1,5 @@
+from typing import List
+
+def word_tokenize(
+    text: str,
+) -> List[str]: ...
2 changes: 2 additions & 0 deletions stubs/tinysegmenter.pyi
@@ -0,0 +1,2 @@
+class TinySegmenter:
+    def tokenize(self, text: str): ...
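The new .pyi stubs give mypy signatures for third-party tokenizers that ship without type information; presumably the project points mypy at this directory via the mypy_path setting or the MYPYPATH environment variable, though that configuration is not part of this diff. Without a stub, importing one of these packages is enough to make mypy complain, roughly:

import jieba  # error: module is installed, but missing library stubs or py.typed marker

# With stubs/jieba.pyi on mypy's search path, this call is checked against the
# declared signature cut(sentence: Union[bytes, str], cut_all: bool = ...).
tokens = list(jieba.cut("自然语言处理", cut_all=False))
print(tokens)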
