diff --git a/docs/user_guide/assets/logo_v1_150.png b/docs/user_guide/assets/logo_v1_150.png new file mode 100644 index 0000000..eb6ae36 Binary files /dev/null and b/docs/user_guide/assets/logo_v1_150.png differ diff --git a/docs/user_guide/assets/logo_v1_670.png b/docs/user_guide/assets/logo_v1_670.png new file mode 100644 index 0000000..46210f5 Binary files /dev/null and b/docs/user_guide/assets/logo_v1_670.png differ diff --git a/newspaper/article.py b/newspaper/article.py index 89f7e86..b77017c 100755 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -17,6 +17,7 @@ import requests from newspaper.exceptions import ArticleException +from newspaper.text import StopWords from . import network from . import nlp @@ -489,9 +490,6 @@ def parse(self) -> "Article": if metadata["language"] in get_available_languages(): self.meta_lang = metadata["language"] - if self.config.use_meta_language: - self.extractor.update_language(self.meta_lang) - self.meta_site_name = metadata["site_name"] self.meta_description = metadata["description"] self.canonical_link = metadata["canonical_link"] @@ -616,9 +614,11 @@ def nlp(self): self.throw_if_not_downloaded_verbose() self.throw_if_not_parsed_verbose() - nlp.load_stopwords(self.config.language) - keywords = nlp.keywords(self.text, self.config.max_keywords) - for k, v in nlp.keywords(self.title, self.config.max_keywords).items(): + stopwords = StopWords(self.config.language) + keywords = nlp.keywords(self.text, stopwords, self.config.max_keywords) + for k, v in nlp.keywords( + self.title, stopwords, self.config.max_keywords + ).items(): if k in keywords: keywords[k] += v keywords[k] /= 2 @@ -634,7 +634,7 @@ def nlp(self): max_sents = self.config.max_summary_sent summary_sents = nlp.summarize( - title=self.title, text=self.text, max_sents=max_sents + title=self.title, text=self.text, stopwords=stopwords, max_sents=max_sents ) self.summary = "\n".join(summary_sents) diff --git a/newspaper/cli.py b/newspaper/cli.py index 
6e1c05c..801fecc 100755 --- a/newspaper/cli.py +++ b/newspaper/cli.py @@ -46,7 +46,7 @@ def get_arparse() -> argparse.ArgumentParser: parser.add_argument( "--output-format", "-of", - choices=["csv", "json", "text"], + choices=["csv", "json", "text"], # TODO: add stdout as an option default="json", help="The output format of the parsed article.", ) diff --git a/newspaper/configuration.py b/newspaper/configuration.py index 2dba63e..9b6311a 100755 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -15,15 +15,6 @@ from newspaper.utils import get_available_languages -from .text import ( - StopWords, - StopWordsArabic, - StopWordsChinese, - StopWordsKorean, - StopWordsHindi, - StopWordsJapanese, - StopWordsThai, -) from .version import __version__ log = logging.getLogger(__name__) @@ -76,8 +67,6 @@ class Configuration: Default True. http_success_only (bool): if True, it will raise an :any:`ArticleException` if the html status_code is >= 400 (e.g. 404 page). default True. - stopwords_class (obj): unique stopword classes for oriental languages, - don't toggle requests_params (dict): Any of the params for the `get call`_ from ``requests`` library number_threads (int): number of threads to use for multi-threaded downloads @@ -181,9 +170,6 @@ def __init__(self): # English is the fallback self._language = "en" - # Unique stopword classes for oriental languages, don't toggle - self.stopwords_class = StopWords - # Params for get call from `requests` lib self.requests_params = { "timeout": 7, @@ -289,7 +275,6 @@ def language(self, value: str): # Set oriental language stopword class self._language = value - self.stopwords_class = self.get_stopwords_class(value) @property def use_meta_language(self): @@ -302,31 +287,6 @@ def use_meta_language(self): """ return self._use_meta_language - @staticmethod - def get_stopwords_class(language: str): - """Get the stopwords class for the given language. 
- Arguments: - language (str): The language for which it will return the StopWords object. - Returns: - class(StopWords): The stopwords class for the given language. - """ - if language == "ko": - return StopWordsKorean - elif language == "hi": - return StopWordsHindi - elif language == "zh": - return StopWordsChinese - # Persian and Arabic Share an alphabet - # There is a persian parser https://github.com/sobhe/hazm, - # but nltk is likely sufficient - elif language == "ar" or language == "fa": - return StopWordsArabic - elif language == "ja": - return StopWordsJapanese - elif language == "th": - return StopWordsThai - return StopWords - @property def MIN_WORD_COUNT(self): warn( diff --git a/newspaper/extractors/articlebody_extractor.py b/newspaper/extractors/articlebody_extractor.py index 215a125..4804432 100755 --- a/newspaper/extractors/articlebody_extractor.py +++ b/newspaper/extractors/articlebody_extractor.py @@ -4,6 +4,7 @@ import lxml import newspaper.extractors.defines as defines import newspaper.parsers as parsers +from newspaper.text import StopWords score_weights = { "start_boosting_score": 1.0, @@ -22,8 +23,6 @@ def __init__(self, config): self.config = config self.top_node = None self.top_node_complemented = None - self.stopwords_class = config.stopwords_class - self.language = config.language def parse(self, doc: lxml.html.Element): """_summary_ @@ -31,6 +30,7 @@ def parse(self, doc: lxml.html.Element): Args: doc (lxml.html.Element): _description_ """ + self.stopwords = StopWords(self.config.language) self.top_node = self.calculate_best_node(doc) self.top_node_complemented = self.complement_with_siblings(self.top_node) @@ -47,9 +47,8 @@ def calculate_best_node(self, doc): text_node = parsers.get_text(node) if not text_node: continue - word_stats = self.stopwords_class( - language=self.language - ).get_stopword_count(text_node) + + word_stats = self.stopwords.get_stopword_count(text_node) high_link_density = parsers.is_highlink_density(node) if 
word_stats.stop_word_count > 2 and not high_link_density: nodes_with_text.append(node) @@ -80,9 +79,7 @@ def calculate_best_node(self, doc): boost_score = score_weights["negative_score_boost"] text_node = parsers.get_text(node) - word_stats = self.stopwords_class( - language=self.language - ).get_stopword_count(text_node) + word_stats = self.stopwords.get_stopword_count(text_node) upscore = int(word_stats.stop_word_count + boost_score) parent_node = node.getparent() @@ -170,9 +167,7 @@ def is_boostable(self, node): if steps_away >= max_stepsaway_from_node: return False paragraph_text = parsers.get_text(current_node) - word_stats = self.stopwords_class( - language=self.language - ).get_stopword_count(paragraph_text) + word_stats = self.stopwords.get_stopword_count(paragraph_text) if word_stats.stop_word_count > minimum_stopword_count: return True steps_away += 1 @@ -288,9 +283,7 @@ def get_plausible_content(self, node, baseline_score, score_weight=0.3): if parsers.is_highlink_density(paragraph): continue - word_stats = self.stopwords_class( - language=self.language - ).get_stopword_count(text) + word_stats = self.stopwords.get_stopword_count(text) if word_stats.stop_word_count > baseline_score * score_weight: element = parsers.create_element(tag="p", text=text) diff --git a/newspaper/extractors/content_extractor.py b/newspaper/extractors/content_extractor.py index d7face7..686ecb8 100755 --- a/newspaper/extractors/content_extractor.py +++ b/newspaper/extractors/content_extractor.py @@ -22,7 +22,6 @@ class ContentExtractor: def __init__(self, config: Configuration): self.config = config self.language = config.language - self.stopwords_class = config.stopwords_class self.title_extractor = TitleExtractor(config) self.author_extractor = AuthorsExtractor(config) self.pubdate_extractor = PubdateExtractor(config) @@ -32,17 +31,6 @@ def __init__(self, config: Configuration): self.image_extractor = ImageExtractor(config) self.video_extractor = VideoExtractor(config) - 
def update_language(self, meta_lang: str): - """Required to be called before the extraction process in some - cases because the stopwords_class has to set in case the lang - is not latin based - """ - if meta_lang: - self.language = meta_lang - self.stopwords_class = self.config.get_stopwords_class(meta_lang) - self.atricle_body_extractor.stopwords_class = self.stopwords_class - self.atricle_body_extractor.language = self.language - def get_authors(self, doc: lxml.html.Element) -> List[str]: """Fetch the authors of the article, return as a list Only works for english articles diff --git a/newspaper/language/__init__.py b/newspaper/language/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/newspaper/language/ar.py b/newspaper/language/ar.py new file mode 100644 index 0000000..36d59c2 --- /dev/null +++ b/newspaper/language/ar.py @@ -0,0 +1,7 @@ +def tokenizer(text): + import nltk + + s = nltk.stem.isri.ISRIStemmer() + words = nltk.tokenize.wordpunct_tokenize(text) + words = [s.stem(word) for word in words] + return words diff --git a/newspaper/language/ja.py b/newspaper/language/ja.py new file mode 100644 index 0000000..eb3227a --- /dev/null +++ b/newspaper/language/ja.py @@ -0,0 +1,12 @@ +try: + import tinysegmenter +except ImportError as e: + raise ImportError( + "You must install tinysegmenter before using the Japanese tokenizer. 
\n" + "Try pip install tinysegmenter\n" + "or pip install newspaper3k[zh]\n" + "or pip install newspaper3k[all]\n" + ) from e + +segmenter = tinysegmenter.TinySegmenter() +tokenizer = segmenter.tokenize diff --git a/newspaper/language/ko.py b/newspaper/language/ko.py new file mode 100644 index 0000000..89631bb --- /dev/null +++ b/newspaper/language/ko.py @@ -0,0 +1,14 @@ +from nltk import word_tokenize + +tokenizer = word_tokenize + + +def find_stopwords(tokens, stopwords): + res = [] + for w in tokens: + for s in stopwords: + if w.endswith(s): + res.append(w) + break + + return res diff --git a/newspaper/language/th.py b/newspaper/language/th.py new file mode 100644 index 0000000..dd80056 --- /dev/null +++ b/newspaper/language/th.py @@ -0,0 +1,11 @@ +try: + import pythainlp +except ImportError as e: + raise ImportError( + "You must install pythainlp before using the Thai tokenizer. \n" + "Try pip install pythainlp\n" + "or pip install newspaper3k[th]\n" + "or pip install newspaper3k[all]\n" + ) from e + +tokenizer = pythainlp.word_tokenize diff --git a/newspaper/language/zh.py b/newspaper/language/zh.py new file mode 100644 index 0000000..ffbf5d9 --- /dev/null +++ b/newspaper/language/zh.py @@ -0,0 +1,11 @@ +try: + import jieba +except ImportError as e: + raise ImportError( + "You must install jieba before using the Chinese tokenizer. \n" + "Try pip install jieba\n" + "or pip install newspaper3k[zh]\n" + "or pip install newspaper3k[all]\n" + ) from e + +tokenizer = lambda x: jieba.cut(x, cut_all=True) # noqa: E731 diff --git a/newspaper/nlp.py b/newspaper/nlp.py index 3217a2f..e207ed5 100755 --- a/newspaper/nlp.py +++ b/newspaper/nlp.py @@ -5,150 +5,188 @@ """ Functions needed for the NLP analysis of articles. """ - - import os import re import math -from pathlib import Path from collections import Counter -from typing import List, Set +from typing import List, Optional + +from newspaper.text import StopWords from . 
import settings -stopwords: Set[str] = set() +def keywords(text: str, stopwords: StopWords, max_keywords: Optional[int] = None): + """Get the top keywords and their frequency scores. Ignores + words in the stopword list, counts the number of occurrences of each word, and + sorts them in descending order by number of occurrences. The frequency scores + are normalized to the range [0, 1], multiplied by 1.5 and offset by 1. -def load_stopwords(language): - """ - Loads language-specific stopwords for keyword selection - """ - # stopwords for nlp in English are not the regular stopwords - # to pass the tests - # can be changed with the tests - - stopwordsFile = Path(settings.STOPWORDS_DIR) / f"stopwords-{language}.txt" - if not stopwordsFile.exists(): - raise ValueError( - f"Language {language} is not supported " - "(or make sure the stopwords file is present in " - "{settings.STOPWORDS_DIR}), please use one of " - "the following: {settings.languages}" - ) - with open(stopwordsFile, "r", encoding="utf-8") as f: - stopwords.update(set([w.strip() for w in f.readlines()])) - - -def keywords(text, max_keywords=None): - """Get the top 10 keywords and their frequency scores ignores blacklisted - words in stopwords, counts the number of occurrences of each word, and - sorts them in reverse natural order (so descending) by number of - occurrences. + Args: + text (str): The text to analyze. + stopwords (StopWords): A StopWords object for the language of the text. + max_keywords (int): The maximum number of keywords returned. defaults + to None, which returns all keywords. + + Returns: + dict: The top keywords mapped to their frequency scores. 
""" - # TODO: parametrable number of keywords - text = split_words(text) + text = list(stopwords.tokenizer(text)) if not text: return dict() # of words before removing blacklist words - num_words = len(text) - text = [x for x in text if x not in stopwords] - freq = {} - for word in text: - if word in freq: - freq[word] += 1 - else: - freq[word] = 1 - - keywords_ = sorted(freq.items(), key=lambda x: (x[1], x[0]), reverse=True) - if max_keywords: - keywords_ = keywords_[:max_keywords] - keywords_ = dict((x, y) for x, y in keywords_) - - for k in keywords_: - articleScore = keywords_[k] * 1.0 / max(num_words, 1) - keywords_[k] = articleScore * 1.5 + 1 - return dict(keywords_) - - -def summarize(url="", title="", text="", max_sents=5): + num_words = len(text) or 1 + text = filter(lambda x: x not in stopwords.stop_words, text) + + freq = Counter(text) + + keywords_ = freq.most_common(max_keywords) + + keywords_ = {k: v * 1.5 / num_words + 1 for k, v in keywords_} + + return keywords_ + + +def summarize( + title: str, text: str, stopwords: StopWords, max_sents: Optional[int] = 5 +): + """Summarize an article into the most relevant sentences in the article. + + Args: + title (str): the article title + text (str): article contents + stopwords (StopWords): stopwords object for the language of the text + max_sents (Optional[int], optional):maximum number of sentences to + return in the summary. Sentences are weighted by their relevance + using the following criteria: sentence position, frequency of + keywords, title words found in the sentence, and sentence length. + Defaults to 5. 
+ + Returns: + List[str]: the selected sentences, in their original text order + """ if not text or not title or max_sents <= 0: return [] summaries = [] sentences = split_sentences(text) - keys = keywords(text, settings.SUMMARIZE_KEYWORD_COUNT) - titleWords = split_words(title) + keys = keywords(text, stopwords, settings.SUMMARIZE_KEYWORD_COUNT) + title_words = list(stopwords.tokenizer(title)) # Score sentences, and use the top 5 or max_sents sentences - ranks = score(sentences, titleWords, keys).most_common(max_sents) - for rank in ranks: - summaries.append(rank[0]) - summaries.sort(key=lambda summary: summary[0]) + ranks = scored_sentences(sentences, title_words, keys, stopwords) + + # Keep only the max_sents highest-ranked sentences + summaries = ranks[:max_sents] + summaries.sort(key=lambda x: x[0]) # Sort by sentence order in the text return [summary[1] for summary in summaries] -def score(sentences, titleWords, keywords): +def title_score(title_tokens, sentence_tokens, stopwords): + title_tokens = [x for x in title_tokens if x not in stopwords.stop_words] + count = 0.0 + + if not title_tokens: + return count + + intersection = [ + word + for word in sentence_tokens + if word in title_tokens and word not in stopwords.stop_words + ] + return len(intersection) / len(title_tokens) + + +def scored_sentences(sentences, title_words, keywords, stopwords): """Score sentences based on different features""" - senSize = len(sentences) - ranks = Counter() + sentence_count = len(sentences) + ranks = [] + for i, s in enumerate(sentences): - sentence = split_words(s) - titleFeature = title_score(titleWords, sentence) - sentenceLength = length_score(len(sentence)) - sentencePosition = sentence_position(i + 1, senSize) - sbsFeature = sbs(sentence, keywords) - dbsFeature = dbs(sentence, keywords) - frequency = (sbsFeature + dbsFeature) / 2.0 * 10.0 + sentence = list(stopwords.tokenizer(s)) + title_features = title_score(title_words, sentence, stopwords) + sent_len = length_score(len(sentence)) + sent_pos = 
sentence_position_score(i + 1, sentence_count) + sbs_feature = sbs(sentence, keywords) + dbs_feature = dbs(sentence, keywords) + frequency = (sbs_feature + dbs_feature) / 2.0 * 10.0 # Weighted average of scores from four categories totalScore = ( - titleFeature * 1.5 - + frequency * 2.0 - + sentenceLength * 1.0 - + sentencePosition * 1.0 + title_features * 1.5 + frequency * 2.0 + sent_len * 1.0 + sent_pos * 1.0 ) / 4.0 - ranks[(i, s)] = totalScore + ranks.append((i, s, totalScore)) + + ranks.sort(key=lambda x: x[2], reverse=True) return ranks +def length_score(sentence_len): + return ( + 1 + - math.fabs(settings.MEAN_SENTENCE_LEN - sentence_len) + / settings.MEAN_SENTENCE_LEN + ) + + +def sentence_position_score(i, size): + """Different sentence positions indicate different + probability of being an important sentence. + """ + normalized = i * 1.0 / size + + ranges = [ + (1.0, 0), + (0.9, 0.15), + (0.8, 0.04), + (0.7, 0.04), + (0.6, 0.06), + (0.5, 0.04), + (0.4, 0.05), + (0.3, 0.08), + (0.2, 0.14), + (0.1, 0.23), + (0, 0.17), + ] + + for r, value in ranges: + if normalized > r: + return value + + return 0 + + def sbs(words, keywords): score = 0.0 - if len(words) == 0: - return 0 - for word in words: - if word in keywords: - score += keywords[word] - return (1.0 / math.fabs(len(words)) * score) / 10.0 + if not words or not keywords: + return score + + scores = [keywords.get(w, 0) for w in words] + score = sum(scores) / len(words) + score /= 10.0 + return score def dbs(words, keywords): - if len(words) == 0: + if not words or not keywords: return 0 + summ = 0 - first = [] - second = [] - - for i, word in enumerate(words): - if word in keywords: - score = keywords[word] - if first == []: - first = [i, score] - else: - second = first - first = [i, score] - dif = first[0] - second[0] - summ += (first[1] * second[1]) / (dif**2) - # Number of intersections - k = len(set(keywords.keys()).intersection(set(words))) + 1 - return 1 / (k * (k + 1.0)) * summ + words_in_keys = [ 
+ (i, keywords[word], word) for i, word in enumerate(words) if word in keywords + ] + if not words_in_keys: + return 0 + intersection = set() + for first, second in zip(words_in_keys, words_in_keys[1:]): + dif = second[0] - first[0] + summ += (first[1] * second[1]) / (dif**2) + intersection.add(first[2]) -def split_words(text): - """Split a string into array of words""" - try: - text = re.sub(r"[^\w ]", "", text) # strip special chars - return [x.strip(".").lower() for x in text.split()] - except TypeError: - return None + intersection.add(words_in_keys[-1][2]) + # Number of intersections + k = len(intersection) + 1 + return 1 / (k * (k + 1.0)) * summ def split_sentences(text: str) -> List[str]: @@ -181,54 +219,3 @@ def split_sentences(text: str) -> List[str]: sentences = tokenizer.tokenize(text) sentences = [re.sub("[\n ]+", " ", x) for x in sentences if len(x) > 10] return sentences - - -def length_score(sentence_len): - return ( - 1 - - math.fabs(settings.MEAN_SENTENCE_LEN - sentence_len) - / settings.MEAN_SENTENCE_LEN - ) - - -def title_score(title, sentence): - if title: - title = [x for x in title if x not in stopwords] - count = 0.0 - for word in sentence: - if word not in stopwords and word in title: - count += 1.0 - return count / max(len(title), 1) - else: - return 0 - - -def sentence_position(i, size): - """Different sentence positions indicate different - probability of being an important sentence. 
- """ - normalized = i * 1.0 / size - if normalized > 1.0: - return 0 - elif normalized > 0.9: - return 0.15 - elif normalized > 0.8: - return 0.04 - elif normalized > 0.7: - return 0.04 - elif normalized > 0.6: - return 0.06 - elif normalized > 0.5: - return 0.04 - elif normalized > 0.4: - return 0.05 - elif normalized > 0.3: - return 0.08 - elif normalized > 0.2: - return 0.14 - elif normalized > 0.1: - return 0.23 - elif normalized > 0: - return 0.17 - else: - return 0 diff --git a/newspaper/text.py b/newspaper/text.py index 10e441e..5116a98 100755 --- a/newspaper/text.py +++ b/newspaper/text.py @@ -1,19 +1,30 @@ # -*- coding: utf-8 -*- # Much of the code here was forked from https://github.com/codelucas/newspaper # Copyright (c) Lucas Ou-Yang (codelucas) - """ Stopword extraction and stopword classes. """ - +import sys +from unicodedata import category from dataclasses import dataclass, field from pathlib import Path import re import string from typing import Dict, List +from nltk.tokenize import WhitespaceTokenizer from newspaper import settings +punctuation = { + c for i in range(sys.maxunicode + 1) if category(c := chr(i)).startswith("P") +} +punctuation.update(string.punctuation) +# remove characters used in contractions +contraction_separators = set("-'`ʹʻʼʽʾʿˈˊ‘’‛′‵Ꞌꞌ") +punctuation -= contraction_separators +punctuation: str = "".join(list(punctuation)) +whitespace_tokenizer = WhitespaceTokenizer() + def innerTrim(value): if isinstance(value, str): @@ -24,6 +35,27 @@ def innerTrim(value): return "" +def default_tokenizer(text): + if isinstance(text, bytes): + text = text.decode("utf-8", "replace") + # Remove punctuation + text = text.translate( + str.maketrans( + punctuation, + " " * len(punctuation), + ) + ) + # remove multiple contraction separators + regex_str = re.escape("".join(contraction_separators)) + text = re.sub( + rf"(?<=\W)[{regex_str}]|[{regex_str}](?=\W)|" + f"^[{regex_str}]*|[{regex_str}]*$|[{regex_str}]{{2,}}", + " ", + text, + ) + 
return whitespace_tokenizer.tokenize(text.lower()) + + @dataclass class WordStats: """Holds the number of stop words and total words in an article""" @@ -34,10 +66,12 @@ class WordStats: class StopWords: - TRANS_TABLE = str.maketrans("", "") _cached_stop_words: Dict[str, str] = {} def __init__(self, language="en"): + self.find_stopwords = None + self.tokenizer = default_tokenizer + if language not in self._cached_stop_words: stopwordsFile = Path(settings.STOPWORDS_DIR) / f"stopwords-{language}.txt" if not stopwordsFile.exists(): @@ -48,148 +82,39 @@ def __init__(self, language="en"): with open(stopwordsFile, "r", encoding="utf-8") as f: self._cached_stop_words[language] = set(f.read().splitlines()) - self.STOP_WORDS = self._cached_stop_words[language] - - def remove_punctuation(self, content): - # code taken form - # http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python - content_is_unicode = isinstance(content, str) - if content_is_unicode: - content = content.encode("utf-8") - trans_table = {ord(c): None for c in string.punctuation} - stripped_input = content.decode("utf-8").translate(trans_table) - - return stripped_input - - def candidate_words(self, stripped_input): - return stripped_input.split(" ") - - def get_stopword_count(self, content): - if not content: - return WordStats() - ws = WordStats() - stripped_input = self.remove_punctuation(content) - candidate_words = self.candidate_words(stripped_input.lower()) - overlapping_stopwords = [] - c = 0 - for w in candidate_words: - c += 1 - if w in self.STOP_WORDS: - overlapping_stopwords.append(w) - - ws.word_count = c - ws.stop_word_count = len(overlapping_stopwords) - ws.stop_words = overlapping_stopwords - return ws - - -class StopWordsChinese(StopWords): - """Chinese segmentation""" - - def __init__(self, language="zh"): - super(StopWordsChinese, self).__init__(language="zh") - - def candidate_words(self, stripped_input): - # jieba builds a tree that takes a while. 
avoid building - # this tree if we don't use the chinese language - import jieba - - return jieba.cut(stripped_input, cut_all=True) - - -class StopWordsArabic(StopWords): - """Arabic segmentation""" - - def __init__(self, language="ar"): - # force ar language code - super(StopWordsArabic, self).__init__(language="ar") + lang_module = Path(__file__).parent / "language" / f"{language}.py" + if lang_module.exists(): + import importlib - def remove_punctuation(self, content): - return content - - def candidate_words(self, stripped_input): - import nltk - - s = nltk.stem.isri.ISRIStemmer() - words = [] - for word in nltk.tokenize.wordpunct_tokenize(stripped_input): - words.append(s.stem(word)) - return words - - -class StopWordsKorean(StopWords): - """Korean segmentation""" - - def __init__(self, language="ko"): - super(StopWordsKorean, self).__init__(language="ko") - - def get_stopword_count(self, content): - if not content: - return WordStats() - ws = WordStats() - stripped_input = self.remove_punctuation(content) - candidate_words = self.candidate_words(stripped_input) - overlapping_stopwords = [] - c = 0 - for w in candidate_words: - c += 1 - for s in self.STOP_WORDS: - if w.endswith(s): - overlapping_stopwords.append(w) - - ws.word_count = c - ws.stop_word_count = len(overlapping_stopwords) - ws.stop_words = overlapping_stopwords - return ws + module = importlib.import_module(f"newspaper.language.{language}") + if not hasattr(module, "tokenizer"): + raise ValueError( + f"Language module {lang_module} has no tokenizer function!" 
+ ) + if hasattr(module, "find_stopwords"): + self.find_stopwords = module.find_stopwords -class StopWordsHindi(StopWords): - """Hindi segmentation""" + self.tokenizer = module.tokenizer - def __init__(self, language="hi"): - super(StopWordsHindi, self).__init__(language="hi") + self.stop_words = self._cached_stop_words[language] def get_stopword_count(self, content): if not content: return WordStats() - ws = WordStats() - stripped_input = self.remove_punctuation(content) - candidate_words = self.candidate_words(stripped_input) - overlapping_stopwords = [] - c = 0 - for w in candidate_words: - c += 1 - for stop_word in self.STOP_WORDS: - overlapping_stopwords.append(stop_word) - - ws.word_count = c - ws.stop_word_count = len(overlapping_stopwords) - ws.stop_words = overlapping_stopwords - return ws - - -class StopWordsJapanese(StopWords): - """Japanese segmentation""" - - def __init__(self, language="ja"): - super(StopWordsJapanese, self).__init__(language="ja") - - def candidate_words(self, stripped_input): - import tinysegmenter - - segmenter = tinysegmenter.TinySegmenter() - tokens = segmenter.tokenize(stripped_input) - return tokens - - -class StopWordsThai(StopWords): - """Thai segmentation""" - - def __init__(self, language="th"): - super(StopWordsThai, self).__init__(language="th") - - def candidate_words(self, stripped_input): - import pythainlp - tokens = pythainlp.word_tokenize(stripped_input) - return tokens + tokens = list(self.tokenizer(content)) + + if self.find_stopwords: + # some special way stopwords are identified. + # Not as full string. 
Korean seems work based on tokens ending + # with the stopword (as if it's a suffix) TODO: confirm this + intersection = self.find_stopwords(tokens, self.stop_words) + else: + intersection = [w for w in tokens if w in self.stop_words] + + return WordStats( + stop_word_count=len(intersection), + word_count=len(tokens), + stop_words=intersection, + ) diff --git a/tests/data/metadata/arabic_article.json b/tests/data/metadata/arabic_article.json new file mode 100644 index 0000000..d2740c0 --- /dev/null +++ b/tests/data/metadata/arabic_article.json @@ -0,0 +1,74 @@ +{ + "url": "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html", + "read_more_link": "", + "language": "ar", + "title": "معارضون يسيطرون على مخازن للصواريخ بريف دمشق", + "top_image": "", + "meta_img": "http://i.cdn.turner.com/cnn/arabic/2013/middle_east/8/3/syria.clashes/Gal.syria.assad.army.jpg_-1_-1.jpg", + "images": [ + "http://i.cdn.turner.com/cnn/images/1.gif", + "http://arabic.cnn.com/.element/img/2.0/hdr-globe-central.gif", + "http://arabic.cnn.com/images/icons/facebook_icon.png", + "http://arabic.cnn.com/images/icons/twitter_icon.png", + "http://arabic.cnn.com/images/icons/rss_icon.png", + "http://arabic.cnn.com/images/icons/youtube.icon.jpg", + "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/Gal.syria.assad.army.jpg_-1_-1.jpg", + "http://arabic.cnn.com/.element/img/2.0/global/misc/advertisement.gif", + "http://arabic.cnn.com/.element/img/2.0/global/misc/advertisement.gif", + "http://i.cdn.turner.com/cnn/.element/img/3.0/1px.gif", + "http://i.cdn.turner.com/cnn/.element/img/3.0/1px.gif", + "http://i.cdn.turner.com/cnn/.element/img/3.0/global/footer/pngs/footer_cnn_logo.png" + ], + "movies": [], + "keywords": [ + "خزن", + "عرض", + "دمشق", + "يطر", + "برف", + "صواريخ", + "\"", + "سور", + "قتل", + "كتب", + "-", + "سلم", + ".", + "سلح", + "بلد", + "نطق", + "طرف", + "فصل", + "ئلف", + "كرد", + "شمل", + "رصد", + "شبك", + "شعب", + "دين", + "(", + ")", + "نظم", + "بشر", + "بـ", + 
"لحر", + "ودع", + "ضاد", + "درع", + "وقت" + ], + "meta_keywords": [ + "" + ], + "tags": null, + "authors": [], + "publish_date": null, + "summary": "دمشق، سوريا (CNN) -- أكدت جهات سورية معارضة أن فصائل مسلحة معارضة لنظام الرئيس بشار الأسد وعلى صلة بـ\"الجيش الحر\" تمكنت من السيطرة على مستودعات للأسلحة بريف دمشق تضم كميات من الصواريخ ومضادات الدروع، في الوقت الذي حض فيه الائتلاف الوطني السوري المعارض الفصائل الكردية والإسلامية المتقاتلة في شمالي البلاد إلى \"ضبط النفس.\"\nوقال المرصد السوري لحقوق الإنسان، وهو هيئة معارضة مقرها لندن، إن مقاتلين من لواء الاسلام - جبهة النصرة- كتيبة التوحيد- قوات المغاوير - كتائب شهداء القلمون، وعدة كتائب أخرى، سيطروا على ثلاثة مستودعات للذخيرة بالقرب من بلدة قلدون في منطقة القلمون بريف دمشق.\nوبحسب المرصد فقد اغتنم مقاتلو الكتائب المقاتلة أسلحة مضادة للدروع وصواريخ أرض- أرض (غراد) وذخائر أخرى متنوعة, كما تجددت الاشتباكات بين مقاتلين من الكتائب المقاتلة من طرف والقوات النظامية ومسلحين من اللجان الشعبية التابعة لها من الطائفة الشيعية من طرف آخر في منطقة السيدة زينب.\nولم ترد تقارير حول الخسائر البشرية، في في حين دارت اشتباكات عنيفة بين الطرفين في وقت متأخر من ليل الجمعة، في قرية التويمية، الواقعة بين منطقة أصفر ونجار وقرية مشرافة في جنوب مدينة راس العين، إثر محاولة مقاتلي الجبهة و\"الدولة الإسلامية\" التقدم باتجاه المدينة.\nأما الائتلاف الوطني السوري المعارض، فقد دعا في بيان له كافة الكتائب والفصائل المقاتلة في الشمال السوري إلى \"ضرورة الوعي بأهمية المرحلة الراهنة، وبضبط النفس والتحلي بالحكمة لضمان سلامة المدنيين وإخلاء سبيل أي أشخاص موقوفين أو معتقلين.\"", + "meta_description": "أكدت جهات سورية معارضة أن فصائل مسلحة معارضة لنظام الرئيس بشار الأسد وعلى صلة بـ\"الجيش الحر\" تمكنت من السيطرة على مستودعات للأسلحة بريف دمشق تضم كميات، في الوقت الذي حض فيه الائتلاف الوطني السوري المعارض الفصائل الكردية والإسلامية المتقاتلة في شمالي البلاد إلى \"ضبط النفس.\"", + "meta_lang": "ar", + "meta_favicon": "", + "meta_site_name": "CNNArabic", + "canonical_link": "http://arabic.cnn.com/2013/middle_east/8/3/syria.clashes/index.html", + "text": "دمشق، 
سوريا (CNN) -- أكدت جهات سورية معارضة أن فصائل مسلحة معارضة لنظام الرئيس بشار الأسد وعلى صلة بـ\"الجيش الحر\" تمكنت من السيطرة على مستودعات للأسلحة بريف دمشق تضم كميات من الصواريخ ومضادات الدروع، في الوقت الذي حض فيه الائتلاف الوطني السوري المعارض الفصائل الكردية والإسلامية المتقاتلة في شمالي البلاد إلى \"ضبط النفس.\"\n\nوقال المرصد السوري لحقوق الإنسان، وهو هيئة معارضة مقرها لندن، إن مقاتلين من لواء الاسلام - جبهة النصرة- كتيبة التوحيد- قوات المغاوير - كتائب شهداء القلمون، وعدة كتائب أخرى، سيطروا على ثلاثة مستودعات للذخيرة بالقرب من بلدة قلدون في منطقة القلمون بريف دمشق.\n\nوبحسب المرصد فقد اغتنم مقاتلو الكتائب المقاتلة أسلحة مضادة للدروع وصواريخ أرض- أرض (غراد) وذخائر أخرى متنوعة, كما تجددت الاشتباكات بين مقاتلين من الكتائب المقاتلة من طرف والقوات النظامية ومسلحين من اللجان الشعبية التابعة لها من الطائفة الشيعية من طرف آخر في منطقة السيدة زينب.\n\nوفي محافظة الحسكة شمال شرقي البلاد، أفاد المرصد عن اشتباكات دارت بعد منتصف ليل الجمعة - السبت، في محيط بلدة تل حلف قرب مدينة رأس العين بين و\"حدات حماية الشعب\" الكردية، ومقاتلي ما يعرف بـ\"الدولة الإسلامية في العراق والشام\" وجبهة النصرة وبعض الكتائب المقاتلة من طرف آخر.\n\nولم ترد تقارير حول الخسائر البشرية، في في حين دارت اشتباكات عنيفة بين الطرفين في وقت متأخر من ليل الجمعة، في قرية التويمية، الواقعة بين منطقة أصفر ونجار وقرية مشرافة في جنوب مدينة راس العين، إثر محاولة مقاتلي الجبهة و\"الدولة الإسلامية\" التقدم باتجاه المدينة.\n\nأما الائتلاف الوطني السوري المعارض، فقد دعا في بيان له كافة الكتائب والفصائل المقاتلة في الشمال السوري إلى \"ضرورة الوعي بأهمية المرحلة الراهنة، وبضبط النفس والتحلي بالحكمة لضمان سلامة المدنيين وإخلاء سبيل أي أشخاص موقوفين أو معتقلين.\"\n\nوشدد الائتلاف على \"ضرورة الابتعاد عن الأعمال الاستفزازية بكافة أشكالها، ويحذر كل من يستغل المرحلة الراهنة لتطبيق أجندات سياسية، وترك القرار للشعب السوري الحر ليختار مصيره بملء إرادته\" في بيان يأتي بالترافق مع الحديث عن كون تلك المواجهات مقدمة لولادة حكومة تدير المناطق التي يقطنها الأكراد في سوريا بشكل مستقل.", + "text_cleaned": "دمشق، سوريا (CNN) -- 
أكدت جهات سورية معارضة أن فصائل مسلحة معارضة لنظام الرئيس بشار الأسد وعلى صلة بـ\"الجيش الحر\" تمكنت من السيطرة على مستودعات للأسلحة بريف دمشق تضم كميات من الصواريخ ومضادات الدروع، في الوقت الذي حض فيه الائتلاف الوطني السوري المعارض الفصائل الكردية والإسلامية المتقاتلة في شمالي البلاد إلى \"ضبط النفس.\"\n\nوقال المرصد السوري لحقوق الإنسان، وهو هيئة معارضة مقرها لندن، إن مقاتلين من لواء الاسلام - جبهة النصرة- كتيبة التوحيد- قوات المغاوير - كتائب شهداء القلمون، وعدة كتائب أخرى، سيطروا على ثلاثة مستودعات للذخيرة بالقرب من بلدة قلدون في منطقة القلمون بريف دمشق.\n\nوبحسب المرصد فقد اغتنم مقاتلو الكتائب المقاتلة أسلحة مضادة للدروع وصواريخ أرض- أرض (غراد) وذخائر أخرى متنوعة, كما تجددت الاشتباكات بين مقاتلين من الكتائب المقاتلة من طرف والقوات النظامية ومسلحين من اللجان الشعبية التابعة لها من الطائفة الشيعية من طرف آخر في منطقة السيدة زينب.\n\nوفي محافظة الحسكة شمال شرقي البلاد، أفاد المرصد عن اشتباكات دارت بعد منتصف ليل الجمعة - السبت، في محيط بلدة تل حلف قرب مدينة رأس العين بين و\"حدات حماية الشعب\" الكردية، ومقاتلي ما يعرف بـ\"الدولة الإسلامية في العراق والشام\" وجبهة النصرة وبعض الكتائب المقاتلة من طرف آخر.\n\nولم ترد تقارير حول الخسائر البشرية، في في حين دارت اشتباكات عنيفة بين الطرفين في وقت متأخر من ليل الجمعة، في قرية التويمية، الواقعة بين منطقة أصفر ونجار وقرية مشرافة في جنوب مدينة راس العين، إثر محاولة مقاتلي الجبهة و\"الدولة الإسلامية\" التقدم باتجاه المدينة.\n\nأما الائتلاف الوطني السوري المعارض، فقد دعا في بيان له كافة الكتائب والفصائل المقاتلة في الشمال السوري إلى \"ضرورة الوعي بأهمية المرحلة الراهنة، وبضبط النفس والتحلي بالحكمة لضمان سلامة المدنيين وإخلاء سبيل أي أشخاص موقوفين أو معتقلين.\"\n\nوشدد الائتلاف على \"ضرورة الابتعاد عن الأعمال الاستفزازية بكافة أشكالها، ويحذر كل من يستغل المرحلة الراهنة لتطبيق أجندات سياسية، وترك القرار للشعب السوري الحر ليختار مصيره بملء إرادته\" في بيان يأتي بالترافق مع الحديث عن كون تلك المواجهات مقدمة لولادة حكومة تدير المناطق التي يقطنها الأكراد في سوريا بشكل مستقل." 
+} \ No newline at end of file diff --git a/tests/data/metadata/article_with_br.json b/tests/data/metadata/article_with_br.json index fa6ff26..40db776 100644 --- a/tests/data/metadata/article_with_br.json +++ b/tests/data/metadata/article_with_br.json @@ -32,41 +32,41 @@ ], "movies": [], "keywords": [ - "sheldon", - "julian", - "involved", "files", "expose", + "sheldon", + "involved", + "julian", "assange", "security", - "adelsons", + "adelson's", "team", "american", - "operation", "court", - "spy", + "operation", "friends", + "spy", "morales", "global", "uc", "embassy", "sands", "vegas", - "las", "cia", - "lahav", "intelligence", + "lahav", + "las", "surveillance", "spanish", "company", "grayzone", "spying", + "2017", "nagel", + "assange's", "ecuadorian", - "assanges", - "2017", - "wikileaks", - "director" + "director", + "wikileaks" ], "meta_keywords": [ "" @@ -74,7 +74,7 @@ "tags": null, "authors": [], "publish_date": null, - "summary": "During the spying operation, Lahav worked directly under Brian Nagel, the director of global security for Las Vegas Sands.\nIt would not be long before Adelson's security team began preparing space for Morales in Las Vegas.\n(Besides Lahav, the legal complaint named Adi Barshishat as an Israeli who helped direct Adelson's security team.\nOn a visit to the embassy, UC Global security photographed the contents of Müller-Maguhn's backpack and the contact numbers in his mobile phone.\nShe was representing Las Vegas Sands, a clear indication that Adelson was deeply concerned about the outcome of the proceedings.", + "summary": "Even the Ecuadorian diplomats whom UC Global was hired to protect were targeted by the spy ring.\nDuring the spying operation, Lahav worked directly under Brian Nagel, the director of global security for Las Vegas Sands.\n(Besides Lahav, the legal complaint named Adi Barshishat as an Israeli who helped direct Adelson's security team.\nOn a visit to the embassy, UC Global security photographed the contents of 
Müller-Maguhn's backpack and the contact numbers in his mobile phone.\nShe was representing Las Vegas Sands, a clear indication that Adelson was deeply concerned about the outcome of the proceedings.", "meta_description": "An exclusive investigation by The Grayzone reveals new details on the critical role Sheldon Adelson's Las Vegas Sands played in an apparent CIA spying operation targeting Julian Assange, and exposes the Sands security staff who helped coordinate...", "meta_lang": "en", "meta_favicon": "/apple-touch-icon-57x57.png", diff --git a/tests/data/metadata/chinese_article_001.json b/tests/data/metadata/chinese_article_001.json old mode 100755 new mode 100644 index 0c31123..0041357 --- a/tests/data/metadata/chinese_article_001.json +++ b/tests/data/metadata/chinese_article_001.json @@ -34,9 +34,41 @@ ], "movies": [], "keywords": [ - "外国专家中美合作共赢符合国际社会利益", - "王辉责任编辑严玉洁", - "在美国旧金山举行的中美元首会晤吸引了全世界的目光多名外国专家告诉中国日报中美合作共赢符合国际社会利益美方应当采取更多行动中美合作共赢是大势所趋美中合作基金会执行主席约翰米勒怀特认为共同繁荣还是互相毁灭这是两国面临的抉择众所周知21世纪的金融往来和军事技术发展让超级大国也无法在冲突中成为赢家瑞典一带一路研究所副所长侯赛因阿斯卡里指出作为全球最大的两个经济体中美关系几乎会影响每一个国家这是不可避免的各国在供应链和贸易运输上相互依存因此中美两个大国必须深化合作避免摩擦和直接对抗克罗地亚地缘经济论坛主席雅思娜佩拉夫妮表示元首会晤的举行本身就是一项重大成就此次会晤让人们对中美两个大国之间的合作前景更加乐观中国从未想过构建一个没有美国的多极世界秩序中美两国有许多共同之处比如实现联合国可持续发展目标美方应当采取更多行动巴基斯坦人类命运共同体研究中心主任哈立德泰穆尔阿克拉姆指出中美元首旧金山会晤为改善两国关系带来希望阿克拉姆认为媒体应该发挥更具建设性的作用中国媒体一贯致力于推动中美关系发展准确而乐观地报道中美合作与发展前景相较之下美西方媒体往往对中国采取敌对态度在他看来通过促进互利合作媒体可以为两国关系止跌企稳作出贡献这完全符合中美两国及其民众的利益亚太一带一路共策会会长翁诗杰表示国际社会高度关注中美如何改善双边关系并解决全球关注的问题他分析称美国曾以国家安全为幌子威胁脱钩和去风险近期又表示不寻求与中国脱钩美方需要作出更令人放心的承诺并采取更多令人安心的行动编辑严玉洁" + "外国", + "外国专家", + "专家", + ":", + "符合国际", + "社会", + "利益", + "", + "中美", + "\n", + "合作", + ",", + "中美合作", + "共", + "赢", + "符合", + "国际", + "。", + "中国", + "·", + "两国", + "“", + "关系", + "会晤", + "采取", + "发展", + "媒体", + "美国", + "元首", + "美方", + "更多", + "行动", + "共同", + "大国", + "”" ], "meta_keywords": [ "" diff --git a/tests/data/metadata/chinese_article_002.json b/tests/data/metadata/chinese_article_002.json old mode 100755 new mode 100644 index 736521b..4fec19a --- 
a/tests/data/metadata/chinese_article_002.json +++ b/tests/data/metadata/chinese_article_002.json @@ -9,20 +9,51 @@ "https://www.news.cn/detail2020/images/ewm.png", "https://www.news.cn/politics/newpage2020/images/qrcode-app.png", "https://www.news.cn/politics/newpage2020/images/qrcode-app.png", + "https://www.news.cn/images/syicon/space.gif", "http://www.news.cn/fortune/titlepic/1129978110_1700103707564_title1n.jpg", "http://www.news.cn/fortune/titlepic/1129980930_1700213925189_title1n.gif", "http://www.news.cn/fortune/titlepic/1129980707_1700209770225_title1n.jpg", "http://www.news.cn/fortune/titlepic/1129247472_1672470686379_title1n.png", "http://www.news.cn/fortune/titlepic/1129247789_1672490511085_title1n.png", - "http://www.news.cn/fortune/titlepic/1129243305_1672447096992_title1n.jpg", - "http://webd.home.news.cn/1.gif?z=1&_wdxid=010030090900000000000000011105701129981476" + "http://www.news.cn/fortune/titlepic/1129243305_1672447096992_title1n.jpg" ], "movies": [], "keywords": [ - "记者17日从国家卫生健康委获悉近阶段相关部门持续依托短缺药品供应保障工作会商联动机制深入优化短缺药品实时监测预警与分级应对体系全力保障人民群众临床用药需求据了解短缺药品信息监测共享正在多维度推进国家卫生健康委依托全国公立医疗卫生机构短缺药品信息直报系统开展短缺药品监测预警与分级应对工业和信息化部对临床必需易短缺药品的生产供应情况开展动态监测与分析预警国家药监局持续采集短缺药品生产供应及停产报告信息同时多环节发挥短缺药品清单抓手作用国家医保局密切监测国家短缺药品清单和临床必需易短缺药品重点监测清单药品价格和配送情况国家药监局加快短缺药品审评审批进度持续开展短缺药品基础信息标记和数据管理工业和信息化部国家卫生健康委积极推动尼可刹米洛贝林原料药复产和制剂生产此外多层次提升短缺药品应对处置水平多部门加强医药领域监管执法市场监管总局指导地方加快医药领域垄断线索核查与案件办理工作工业和信息化部提前应对秋冬季流感疫情组织生产企业增产扩能商务部分析研判汛情对药品供应链的影响指导药品流通企业加大药品供应力度国家邮政局持续推动邮政综合服务平台建设提高医院药品互联网医疗平台寄递服务能力国家中医药局积极协调相关部门行业协会等推动中药材供需总体平衡", - "新华社北京11月17日电", - "短缺药品保供稳价", - "全力保障临床用药需求" + "", + "保供", + "稳", + "价", + " ", + "全力", + "力保", + "临床用", + "用药", + "需求", + "药品", + "短缺", + "。", + "临床", + "保障", + ",", + "国家", + "信息", + "监测", + "\n", + "供应", + "卫生", + "持续", + "应对", + "生产", + "、", + "健康", + "委", + "部门", + "预警", + "开展", + "工业", + "信息化", + "部", + "清单" ], "meta_keywords": [ "短缺", @@ -34,7 +65,7 @@ "authors": [], "publish_date": "2023-11-17T00:00:00", "summary": "新华社北京11月17日电 
记者17日从国家卫生健康委获悉,近阶段,相关部门持续依托短缺药品供应保障工作会商联动机制,深入优化短缺药品实时监测预警与分级应对体系,全力保障人民群众临床用药需求。 据了解,短缺药品信息监测共享正在多维度推进。国家卫生健康委依托全国公立医疗卫生机构短缺药品信息直报系统开展短缺药品监测预警与分级应对。工业和信息化部对临床必需易短缺药品的生产供应情况开展动态监测与分析预警。国家药监局持续采集短缺药品生产供应及停产报告信息。 同时,多环节发挥短缺药品清单抓手作用。国家医保局密切监测国家短缺药品清单和临床必需易短缺药品重点监测清单药品价格和配送情况。国家药监局加快短缺药品审评审批进度,持续开展短缺药品基础信息标记和数据管理。工业和信息化部、国家卫生健康委积极推动尼可刹米、洛贝林原料药复产和制剂生产。 此外,多层次提升短缺药品应对处置水平。多部门加强医药领域监管执法,市场监管总局指导地方加快医药领域垄断线索核查与案件办理工作。工业和信息化部提前应对秋冬季流感疫情,组织生产企业增产扩能。商务部分析研判汛情对药品供应链的影响,指导药品流通企业加大药品供应力度。国家邮政局持续推动邮政综合服务平台建设,提高医院药品、互联网医疗平台寄递服务能力。国家中医药局积极协调相关部门、行业协会等,推动中药材供需总体平衡。", - "meta_description": "短缺药品保供稳价 全力保障临床用药需求\r\n---记者17日从国家卫生健康委获悉,近阶段,相关部门持续依托短缺药品供应保障工作会商联动机制,深入优化短缺药品实时监测预警与分级应对体系,全力保障人民群众临床用药需求。", + "meta_description": "短缺药品保供稳价 全力保障临床用药需求\n---记者17日从国家卫生健康委获悉,近阶段,相关部门持续依托短缺药品供应保障工作会商联动机制,深入优化短缺药品实时监测预警与分级应对体系,全力保障人民群众临床用药需求。", "meta_lang": "", "meta_favicon": "", "meta_site_name": "", diff --git a/tests/data/metadata/cleveland.com1.json b/tests/data/metadata/cleveland.com1.json new file mode 100644 index 0000000..9956f88 --- /dev/null +++ b/tests/data/metadata/cleveland.com1.json @@ -0,0 +1,63 @@ +{ + "url": "http://www.cleveland.com/food/index.ssf/2014/12/let_me_google_that_for_you_201.html", + "read_more_link": "", + "language": "en", + "title": "Let me Google that for you: 2014's most popular food-related searches", + "top_image": "", + "meta_img": "http://imgick.oregonlive.com/home/olive-media/width620/img/today/photo/16470669-standard.jpg", + "images": [], + "movies": [ + "//www.youtube.com/embed/JOCtdw9FG-s" + ], + "keywords": [ + "2014's", + "popular", + "google", + "searches", + "food-related", + "searched", + "search", + "food", + "eat", + "hungry", + "tiny", + "jpg", + "chia", + "seeds", + "goji", + "berries", + "kale", + "quinoa", + "google's", + "recent", + "parsing", + "food-focused", + "giant", + "pours", + "fascinating", + "queries", + "interesting", + "data", + "points", + "include", + "pizza", + "world", + "cup", + "cronut", + "rose" + ], + "meta_keywords": [ + "" 
+ ], + "tags": null, + "authors": [], + "publish_date": "2014-12-27T12:06:00", + "summary": "google.JPG Chia seeds and goji berries are the new kale and quinoa, according to Google's recent parsing of food-focused searches from 2014.\nEach year, the search giant pours through some of our more fascinating queries to come up with their Year in Search.\nSome of the other more interesting food-related data points include: Pizza was searched more than the World Cup.\nHungry folk in Australia searched Argentine food more than Argentina.\nIn 2014 'i am hungry' was searched a button-popping 7x more than 'i am thirsty.'", + "meta_description": "The lesson, as always: You are what you search.", + "meta_lang": "en", + "meta_favicon": "", + "meta_site_name": "cleveland.com", + "canonical_link": "http://www.cleveland.com/8003098", + "text": "google.JPG\n\nChia seeds and goji berries are the new kale and quinoa, according to Google's recent parsing of food-focused searches from 2014. Each year, the search giant pours through some of our more fascinating queries to come up with their Year in Search.\n\nSome of the other more interesting food-related data points include:\n\nPizza was searched more than the World Cup.\n\nThe Cronut rose to 17th on the global recipe list after its arrival last year.\n\nOur favorite ways to eat eggs are: 1) Deviled, 2) Scotch, 3) Scrambled, 4) Pickled, 5) Boiled.\n\nThis year we searched for 'recipes' less and 'restaurant' significantly more.\n\nOur top slimming questions were 'how many calories should i eat in a day' and 'how to lose weight,' and the Paleo diet was the top searched way to trim down.\n\nFoodies in Japan searched French food more than France.\n\nHungry folk in Australia searched Argentine food more than Argentina.\n\nSpice-loving Brits searched Indian food more than India.\n\nIn 2014 'i am hungry' was searched a button-popping 7x more than 'i am thirsty.'\n\nOh, and nine million people watched a tiny hampster eating a tiny 
burrito.\n\nThe lesson, as always: you are what you search.\n\n-- By Michael Russell Follow @tdmrussell\n\nRelated Stories", + "text_cleaned": "Chia seeds and goji berries are the new kale and quinoa, according to Google's recent parsing of food-focused searches from 2014. Each year, the search giant pours through some of our more fascinating queries to come up with their Year in Search.\n\nSome of the other more interesting food-related data points include:\n\nPizza was searched more than the World Cup.\n\nThe Cronut rose to 17th on the global recipe list after its arrival last year.\n\nOur favorite ways to eat eggs are: 1) Deviled, 2) Scotch, 3) Scrambled, 4) Pickled, 5) Boiled.\n\nThis year we searched for 'recipes' less and 'restaurant' significantly more.\n\nOur top slimming questions were 'how many calories should i eat in a day' and 'how to lose weight,' and the Paleo diet was the top searched way to trim down.\n\nFoodies in Japan searched French food more than France.\n\nHungry folk in Australia searched Argentine food more than Argentina.\n\nSpice-loving Brits searched Indian food more than India.\n\nIn 2014 'i am hungry' was searched a button-popping 7x more than 'i am thirsty.'\n\nOh, and nine million people watched a tiny hampster eating a tiny burrito.\n\nThe lesson, as always: you are what you search." 
+} \ No newline at end of file diff --git a/tests/data/metadata/cnn_001.json b/tests/data/metadata/cnn_001.json old mode 100755 new mode 100644 index 3e778e3..865e383 --- a/tests/data/metadata/cnn_001.json +++ b/tests/data/metadata/cnn_001.json @@ -11,11 +11,11 @@ ], "movies": [], "keywords": [ - "sparring", "nonstop", + "sparring", "match", - "liberal", "conservative", + "liberal", "attorney", "prelogar", "alito", @@ -23,29 +23,29 @@ "biden", "justice", "supreme", - "im", - "service", + "i’m", "solicitor", - "point", - "justices", + "service", "administration", + "justices", + "point", "case", - "rotc", - "risk", - "making", - "law", - "general", "diversity", - "vote", - "military", + "general", + "law", + "making", + "risk", + "rotc", + "arguments", + "vaccines", + "cases", "lectern", - "generals", + "school", + "general’s", + "vote", "federal", - "cases", - "arguments", "agency", - "vaccines", - "university" + "military" ], "meta_keywords": [ "" @@ -62,5 +62,5 @@ "meta_site_name": "CNN", "canonical_link": "https://www.cnn.com/2023/11/06/politics/alito-prelogar-supreme-court-analysis/index.html", "text": "Justice Samuel Alito is the tip of the spear for conservatives challenging the Biden administration during oral arguments at the Supreme Court.\n\nHe’s a fierce questioner, ready to trap advocates in their arguments. And he becomes demonstrably riled when he fails to get the answer he wants. He shakes his head and rolls his eyes.\n\nSolicitor General Elizabeth Prelogar is the Biden administration’s top lawyer at the court, defending the policies that are the source of much of Alito’s consternation. She responds to him with a steady pitch and precision. 
And she is not derailed by what he puts down.\n\nTheir jousting over issues such as abortion, vaccines, and all manner of regulatory power offers some of the most riveting exchanges heard these days at America’s high court.\n\nWhen excerpts from their Supreme Court audio appear on YouTube or other social media, it attracts thousands of views, sometimes hundreds of thousands, as court-watchers debate who got the better of the argument.\n\nTheir back-and-forth provides more than drama in the white marble setting. Prelogar, 43, argues the government’s most consequential cases. And the exchanges with Alito, 73, have the potential to influence other justices and affect whether the government wins or loses.\n\nPrelogar will be at the lectern on Tuesday in a major Second Amendment case, defending a law that prohibits persons subject to domestic violence protective orders from possessing a firearm.\n\nAlito, a Trenton, New Jersey, native, once stood at the lectern that Prelogar commands today.\n\nAfter graduating from Princeton and Yale Law School, he joined the Department of Justice and spent about five years in the early 1980s in the solicitor general’s office. President Ronald Reagan named Alito a US attorney in New Jersey and President George H.W. Bush tapped him for a prestigious appellate court post in 1990. President George W. Bush elevated Alito to the high court in January 2006, to succeed the retiring Justice Sandra Day O’Connor.\n\nPrelogar, who grew up in Boise, Idaho, graduated from Emory University and then Harvard Law School. As a teen she entered state pageants and won the Miss Idaho title in 2004. 
She said she used the pageant scholarship money for law school.\n\n“If you want to look at a through-line here, I like to go in front of judges,” Prelogar said recently about the experience on NPR’s “Wait, Wait … Don’t Tell Me!” The early pageant work may also contribute to her ease at the courtroom lectern and economy of language, shed of the usual “ums” and “ahs” that plague many lawyers.\n\nPrelogar first became familiar with the inner workings of the Supreme Court as a law clerk to liberal Justices Ruth Bader Ginsburg and Elena Kagan. From 2014 to 2019, Prelogar was an assistant to the solicitor general arguing less prominent cases before the justices, and was separately detailed to an investigation into Russian interference in the 2016 presidential election, under special counsel Robert Mueller. In 2021, President Joe Biden nominated her to be US solicitor general, and the Senate confirmed Prelogar by a vote of 53-36. Only six Republicans joined Democrats to approve her.\n\nUnprecedented, but to what end?\n\nSupreme Court oral arguments always begin with some element of suspense: How effectively will lawyers at the lectern make their case and what will justices reveal of their own views? The justices sometimes use these public sessions to press their own positions, with statements cloaked as questions, essentially beginning their negotiations with colleagues.\n\nAny lawyer arguing a progressive position, as Prelogar regularly does, faces an uphill climb, because of the court’s conservative supermajority. For about a half century, the court was generally 5-4, conservative-liberal. Since 2020, it has been 6-3 conservative-liberal.\n\nA few weeks ago, Alito and Prelogar sparred over a federal agency established to protect consumers from risky mortgages, auto loans and credit card deals.\n\nThe case began when payday lenders challenged the constitutionality of Consumer Financial Protection Bureau’s funding structure. 
Congress – seeking to ensure the agency’s independence – required it to be financed annually through the Federal Reserve System (which itself is funded through bank fees), rather than regular congressional appropriations.\n\nIt was a spirited argument, although the intensity in the courtroom as Alito challenged Prelogar’s justifications may be lost a bit in the cool words from a transcript.\n\n“There have been agencies funded this way for every year of this nation’s history,” Prelogar told the justices, as she defended the bureau established after the country’s 2008 financial crisis.\n\n“What is your best historic, your single best example, of an agency that has all of the features that the CFPB has …” Alito asked in one of his series of queries.\n\n“I think our best example historically is the Customs Service,” Prelogar responded. “The first Congress created the Customs Service in 1789. It gave the Customs Service a standing, uncapped source of funding from the revenues that the Customs Service collected.”\n\nNot satisfied, Alito continued: “What’s your best example of an agency that draws its money from another agency that, in turn, does not get its money from a congressional appropriation in the normal sense of that term but gets it from the private sector?”\n\n“I can’t give you another example of a source that’s precisely like that one,” Prelogar said, “but I would dispute the premise that that could possibly be constitutionally relevant. This is a case about Congress’s own prerogatives over the purse, its authority.”\n\n“So,” Alito declared, “I take it your answer is that you do not … But you think that to the extent it is unprecedented, it is unprecedented in a way that is not relevant for present purposes? Is that your answer?”\n\n“Yes, primarily,” Prelogar said, adding with a bit of cheek. “I think it would be unprecedented in a way that you could say this is the only agency that has the acronym CFPB. 
That’s obviously true also, but it doesn’t track the constitutional value.”\n\nOvercoming nerves\n\nChief Justice John Roberts, who before becoming a justice also served in the solicitor general’s office and then as a private appellate attorney, was a superb advocate himself. Known for rigorous preparation, he was clear and conversational.\n\nRoberts also acknowledged over the years how nerve-wracking it was. His hands would often shake before he stood up to argue, subdued once he grasped the sides of the lectern and began his presentation.\n\nFrom the bench, Roberts is a tough questioner of Prelogar and Biden administration policy, but without the palpable antagonism that often comes from Alito.\n\nDuring a January 2022 argument over a Biden Covid-19 vaccine requirement for federal workers, Alito began a set of questions with a tone that was alternately aggrieved and assured. He held the upper hand, as a majority of his colleagues were similarly skeptical about the reach of government power.\n\n“I don’t want to be misunderstood in making this point because I’m not saying the vaccines are unsafe. The FDA has approved them. It’s found that they’re safe. It’s said that the benefits greatly outweigh the risks. I’m not contesting that in any way. I don’t want to be misunderstood. I’m sure I will be misunderstood. I just want to emphasize I’m not making that point,” Alito said.\n\n“But,” Alito continued, as he confidently addressed Prelogar, “is it not the case that these vaccines and every other vaccine of which I’m aware and many other medications have benefits and they also have risks and that some people who are vaccinated and some people who take medication that is highly beneficial will suffer adverse consequences? Is that not true of these vaccines?”\n\n“That can be true,” Prelogar said, “but, of course, there is far, far greater risk from being unvaccinated, by orders of magnitude.”\n\n“But … there is some risk,” Alito interjected. 
“Do you dispute that?”\n\n“There can be a very minimal risk with respect to some individuals, but, again, I would emphasize that there would be no basis to think that these FDA-approved and authorized vaccines are not safe and effective. They are the single-most effective.”\n\nAlito cut her off: “No, I’m not making that point. I tried to make it as clear as I could. I’m not making that point. I’m not making that point. I’m not making that point. There is a risk, right? Has OSHA ever imposed any other safety regulation that imposes some extra risk, some different risk, on the employee?\n\nPrelogar: “I can’t think of anything else that’s precisely like this, but I think that to suggest that OSHA is precluded from using the most common, routine, safe, effective, proven strategy to fight an infectious disease at work would be a departure from how this statute should be understood.”\n\nAs the two continued, talking over each other, they challenged the transcription service based on how many broken sentences and dashes were recorded.\n\nWhen a clip of that exchange over the vaccine requirement of the Occupational Safety and Health Administration (OSHA) was posted online, more than 500,000 people viewed it, and more than 6,000 people left comments.\n\nThe administration lost by a 6-3 vote along ideological lines.\n\nAffirmative action and ‘the nation that we aspire to be’\n\nJeffrey Wall, who was a top official in the solicitor general’s office during the Trump administration, said the Biden solicitor general necessarily faces “headwinds with … a court that is more generally skeptical of government power.”\n\nSpeaking at a Practising Law Institute review in August, Wall overall praised the solicitor general’s record. 
“I think that General Prelogar should feel pretty good, all things considered, about how the term went,” he said.\n\nWall referred to the high-profile administration loss in the Harvard and University of North Carolina affirmative action cases last session, saying that “no one believes that those cases could have turned on the SG’s advocacy.”\n\nThe affirmative action arguments offered several Alito-Prelogar moments of tension. The Biden administration was backing admissions practices that considered students’ race as a factor in admissions to achieve campus diversity.\n\nPrelogar highlighted repercussions for the military if racial affirmative action was eliminated at the service academies or colleges with ROTC programs that prepare officer candidates.\n\n“What about a college that does not have a ROTC program,” Alito interjected, homing in on what he plainly saw as a specious ROTC reference and larger appeal to the military interest. “Would a plan that would be permissible at a college that has a program be impermissible at the latter, at the one that doesn’t have the ROTC program?”\n\n“We’re not asking the court to draw that distinction,” Prelogar told Alito, asserting that the government’s interest extends “more broadly to other federal agencies, to the federal government’s employment practices itself, and to having a set of leaders in our country who are trained to succeed in diverse environments.”\n\n“Well,” Alito countered, “then I don’t understand the relevance of what you’re saying about the link between college education either at a service academy or a school with an ROTC program and the needs of the military if it doesn’t matter whether the school has no ROTC program and therefore trains no officers.”\n\nPrelogar said the military’s interest in diversity was not confined to the service academies. 
“We believe deeply in the value of diversity and in universities being able to obtain the educational benefits that correlate with diversity,” she said.\n\nAlito later seized on Prelogar’s attempt to persuade the court to look at “the nation that we aspire to be”; he suggested the approach rang hollow.\n\n“For corporate America,” Prelogar had said as she concluded her arguments in the dispute, “diversity is essential to business solutions. For the medical community and scientific researchers, diversity is an essential element of innovation and delivering better health outcomes.”\n\nAlito told Prelogar it appeared she wanted to take the controversy at hand involving education and extend it to employment. “Is that right?” he asked.\n\n“No, Justice Alito,” Prelogar said. “I was trying to make the observation that the experience of students in those four years of college have effects on the course of their life.”\n\n“Then why were you talking about corporate America,” he rejoined.\n\n“Because corporate America,” she said, “like the United States military, relies on having a diverse pipeline of individuals who had the experience of learning in a diverse educational environment and who themselves reflect the diversity of the American population.”\n\nIn the end, Harvard and the University of North Carolina, along with the Biden administration, lost the dispute by a 6-3 vote along the familiar ideological lines. Roberts, who wrote for the majority opinion that was signed by Alito and the other justices on the right wing, added a footnote that said the decision did not apply to West Point and the other military academies. (Students for Fair Admissions, the group that started the Harvard and University of North Carolina lawsuits, recently filed cases against the service academies.)\n\nIn fair Verona\n\nTo be sure, not all Alito-Prelogar matchups end with ideological divisions. 
And indeed one of their earliest encounters found the two aligned, in a true theatrical situation, as the Washington-based Shakespeare Theatre Company staged a mock trial in December 2016.\n\nThe Romeo and Juliet tragedy was at the center of the mock case as it tested a wrongful death lawsuit brought by their parents (the Montagues and Capulets) against Friar Laurence, who had secretly married the young couple then helped Juliet fake her death.\n\nAlito presided as the “chief justice” with four “associate justices,” two of whom happened to be then-lower court judges Brett Kavanaugh and Ketanji Brown Jackson (eventually elevated to the real Supreme Court).\n\nPrelogar defended the Friar. Befitting the general theater audience and timing after the November 2016 election, her defense included various political and pop culture (Taylor Swift) references.\n\nOf Friar Lawrence, Prelogar said at one point, “He just wanted to make Verona great again.” Then she delivered her closing remarks in a sonnet form, “because Iambic pentameter is exceedingly persuasive.”\n\nAfter the mock panel deliberated, Alito and the others returned to the bench to deliver their ruling. He praised the lawyers on both sides of the case. “From now on,” he quipped, “I’m going to expect all the briefs from the solicitor general’s office to be in iambic pentameter.”\n\nHe then announced that Prelogar had prevailed in her defense of the Friar. Referring to the fact that the audience had separately taken its own vote, Alito joked, “Since this is a principality, it really doesn’t matter how the people voted.”\n\nPrelogar, it turned out, won that vote, too.", - "text_cleaned": "CNN —\n\nJustice Samuel Alito is the tip of the spear for conservatives challenging the Biden administration during oral arguments at the Supreme Court.\n\nHe’s a fierce questioner, ready to trap advocates in their arguments. And he becomes demonstrably riled when he fails to get the answer he wants. 
He shakes his head and rolls his eyes.\n\nSolicitor General Elizabeth Prelogar is the Biden administration’s top lawyer at the court, defending the policies that are the source of much of Alito’s consternation. She responds to him with a steady pitch and precision. And she is not derailed by what he puts down.\n\nTheir jousting over issues such as abortion, vaccines, and all manner of regulatory power offers some of the most riveting exchanges heard these days at America’s high court.\n\nWhen excerpts from their Supreme Court audio appear on YouTube or other social media, it attracts thousands of views, sometimes hundreds of thousands, as court-watchers debate who got the better of the argument.\n\nTheir back-and-forth provides more than drama in the white marble setting. Prelogar, 43, argues the government’s most consequential cases. And the exchanges with Alito, 73, have the potential to influence other justices and affect whether the government wins or loses.\n\nPrelogar will be at the lectern on Tuesday in a major Second Amendment case, defending a law that prohibits persons subject to domestic violence protective orders from possessing a firearm.\n\nAlito, a Trenton, New Jersey, native, once stood at the lectern that Prelogar commands today.\n\nAfter graduating from Princeton and Yale Law School, he joined the Department of Justice and spent about five years in the early 1980s in the solicitor general’s office. President Ronald Reagan named Alito a US attorney in New Jersey and President George H.W. Bush tapped him for a prestigious appellate court post in 1990. President George W. Bush elevated Alito to the high court in January 2006, to succeed the retiring Justice Sandra Day O’Connor.\n\nPrelogar, who grew up in Boise, Idaho, graduated from Emory University and then Harvard Law School. As a teen she entered state pageants and won the Miss Idaho title in 2004. 
She said she used the pageant scholarship money for law school.\n\n“If you want to look at a through-line here, I like to go in front of judges,” Prelogar said recently about the experience on NPR’s “Wait, Wait … Don’t Tell Me!” The early pageant work may also contribute to her ease at the courtroom lectern and economy of language, shed of the usual “ums” and “ahs” that plague many lawyers.\n\nPrelogar first became familiar with the inner workings of the Supreme Court as a law clerk to liberal Justices Ruth Bader Ginsburg and Elena Kagan. From 2014 to 2019, Prelogar was an assistant to the solicitor general arguing less prominent cases before the justices, and was separately detailed to an investigation into Russian interference in the 2016 presidential election, under special counsel Robert Mueller. In 2021, President Joe Biden nominated her to be US solicitor general, and the Senate confirmed Prelogar by a vote of 53-36. Only six Republicans joined Democrats to approve her.\n\nUnprecedented, but to what end?\n\nSupreme Court oral arguments always begin with some element of suspense: How effectively will lawyers at the lectern make their case and what will justices reveal of their own views? The justices sometimes use these public sessions to press their own positions, with statements cloaked as questions, essentially beginning their negotiations with colleagues.\n\nAny lawyer arguing a progressive position, as Prelogar regularly does, faces an uphill climb, because of the court’s conservative supermajority. For about a half century, the court was generally 5-4, conservative-liberal. Since 2020, it has been 6-3 conservative-liberal.\n\nA few weeks ago, Alito and Prelogar sparred over a federal agency established to protect consumers from risky mortgages, auto loans and credit card deals.\n\nThe case began when payday lenders challenged the constitutionality of Consumer Financial Protection Bureau’s funding structure. 
Congress – seeking to ensure the agency’s independence – required it to be financed annually through the Federal Reserve System (which itself is funded through bank fees), rather than regular congressional appropriations.\n\nIt was a spirited argument, although the intensity in the courtroom as Alito challenged Prelogar’s justifications may be lost a bit in the cool words from a transcript.\n\n“There have been agencies funded this way for every year of this nation’s history,” Prelogar told the justices, as she defended the bureau established after the country’s 2008 financial crisis.\n\nVideo Ad Feedback\n\nHear what happened inside the Supreme Court after historic ruling\n\n05:22 - Source: CNN\n\n“What is your best historic, your single best example, of an agency that has all of the features that the CFPB has …” Alito asked in one of his series of queries.\n\n“I think our best example historically is the Customs Service,” Prelogar responded. “The first Congress created the Customs Service in 1789. It gave the Customs Service a standing, uncapped source of funding from the revenues that the Customs Service collected.”\n\nNot satisfied, Alito continued: “What’s your best example of an agency that draws its money from another agency that, in turn, does not get its money from a congressional appropriation in the normal sense of that term but gets it from the private sector?”\n\n“I can’t give you another example of a source that’s precisely like that one,” Prelogar said, “but I would dispute the premise that that could possibly be constitutionally relevant. This is a case about Congress’s own prerogatives over the purse, its authority.”\n\n“So,” Alito declared, “I take it your answer is that you do not … But you think that to the extent it is unprecedented, it is unprecedented in a way that is not relevant for present purposes? Is that your answer?”\n\n“Yes, primarily,” Prelogar said, adding with a bit of cheek. 
“I think it would be unprecedented in a way that you could say this is the only agency that has the acronym CFPB. That’s obviously true also, but it doesn’t track the constitutional value.”\n\nOvercoming nerves\n\nChief Justice John Roberts, who before becoming a justice also served in the solicitor general’s office and then as a private appellate attorney, was a superb advocate himself. Known for rigorous preparation, he was clear and conversational.\n\nRoberts also acknowledged over the years how nerve-wracking it was. His hands would often shake before he stood up to argue, subdued once he grasped the sides of the lectern and began his presentation.\n\nFrom the bench, Roberts is a tough questioner of Prelogar and Biden administration policy, but without the palpable antagonism that often comes from Alito.\n\nDuring a January 2022 argument over a Biden Covid-19 vaccine requirement for federal workers, Alito began a set of questions with a tone that was alternately aggrieved and assured. He held the upper hand, as a majority of his colleagues were similarly skeptical about the reach of government power.\n\n“I don’t want to be misunderstood in making this point because I’m not saying the vaccines are unsafe. The FDA has approved them. It’s found that they’re safe. It’s said that the benefits greatly outweigh the risks. I’m not contesting that in any way. I don’t want to be misunderstood. I’m sure I will be misunderstood. I just want to emphasize I’m not making that point,” Alito said.\n\n“But,” Alito continued, as he confidently addressed Prelogar, “is it not the case that these vaccines and every other vaccine of which I’m aware and many other medications have benefits and they also have risks and that some people who are vaccinated and some people who take medication that is highly beneficial will suffer adverse consequences? 
Is that not true of these vaccines?”\n\n“That can be true,” Prelogar said, “but, of course, there is far, far greater risk from being unvaccinated, by orders of magnitude.”\n\n“But … there is some risk,” Alito interjected. “Do you dispute that?”\n\n“There can be a very minimal risk with respect to some individuals, but, again, I would emphasize that there would be no basis to think that these FDA-approved and authorized vaccines are not safe and effective. They are the single-most effective.”\n\nAlito cut her off: “No, I’m not making that point. I tried to make it as clear as I could. I’m not making that point. I’m not making that point. I’m not making that point. There is a risk, right? Has OSHA ever imposed any other safety regulation that imposes some extra risk, some different risk, on the employee?\n\nPrelogar: “I can’t think of anything else that’s precisely like this, but I think that to suggest that OSHA is precluded from using the most common, routine, safe, effective, proven strategy to fight an infectious disease at work would be a departure from how this statute should be understood.”\n\nAs the two continued, talking over each other, they challenged the transcription service based on how many broken sentences and dashes were recorded.\n\nWhen a clip of that exchange over the vaccine requirement of the Occupational Safety and Health Administration (OSHA) was posted online, more than 500,000 people viewed it, and more than 6,000 people left comments.\n\nThe administration lost by a 6-3 vote along ideological lines.\n\nAffirmative action and ‘the nation that we aspire to be’\n\nJeffrey Wall, who was a top official in the solicitor general’s office during the Trump administration, said the Biden solicitor general necessarily faces “headwinds with … a court that is more generally skeptical of government power.”\n\nSpeaking at a Practising Law Institute review in August, Wall overall praised the solicitor general’s record. 
“I think that General Prelogar should feel pretty good, all things considered, about how the term went,” he said.\n\nWall referred to the high-profile administration loss in the Harvard and University of North Carolina affirmative action cases last session, saying that “no one believes that those cases could have turned on the SG’s advocacy.”\n\nThe affirmative action arguments offered several Alito-Prelogar moments of tension. The Biden administration was backing admissions practices that considered students’ race as a factor in admissions to achieve campus diversity.\n\nPrelogar highlighted repercussions for the military if racial affirmative action was eliminated at the service academies or colleges with ROTC programs that prepare officer candidates.\n\n“What about a college that does not have a ROTC program,” Alito interjected, homing in on what he plainly saw as a specious ROTC reference and larger appeal to the military interest. “Would a plan that would be permissible at a college that has a program be impermissible at the latter, at the one that doesn’t have the ROTC program?”\n\n“We’re not asking the court to draw that distinction,” Prelogar told Alito, asserting that the government’s interest extends “more broadly to other federal agencies, to the federal government’s employment practices itself, and to having a set of leaders in our country who are trained to succeed in diverse environments.”\n\n“Well,” Alito countered, “then I don’t understand the relevance of what you’re saying about the link between college education either at a service academy or a school with an ROTC program and the needs of the military if it doesn’t matter whether the school has no ROTC program and therefore trains no officers.”\n\nPrelogar said the military’s interest in diversity was not confined to the service academies. 
“We believe deeply in the value of diversity and in universities being able to obtain the educational benefits that correlate with diversity,” she said.\n\nAlito later seized on Prelogar’s attempt to persuade the court to look at “the nation that we aspire to be”; he suggested the approach rang hollow.\n\n“For corporate America,” Prelogar had said as she concluded her arguments in the dispute, “diversity is essential to business solutions. For the medical community and scientific researchers, diversity is an essential element of innovation and delivering better health outcomes.”\n\nAlito told Prelogar it appeared she wanted to take the controversy at hand involving education and extend it to employment. “Is that right?” he asked.\n\n“No, Justice Alito,” Prelogar said. “I was trying to make the observation that the experience of students in those four years of college have effects on the course of their life.”\n\n“Then why were you talking about corporate America,” he rejoined.\n\n“Because corporate America,” she said, “like the United States military, relies on having a diverse pipeline of individuals who had the experience of learning in a diverse educational environment and who themselves reflect the diversity of the American population.”\n\nIn the end, Harvard and the University of North Carolina, along with the Biden administration, lost the dispute by a 6-3 vote along the familiar ideological lines. Roberts, who wrote for the majority opinion that was signed by Alito and the other justices on the right wing, added a footnote that said the decision did not apply to West Point and the other military academies. (Students for Fair Admissions, the group that started the Harvard and University of North Carolina lawsuits, recently filed cases against the service academies.)\n\nIn fair Verona\n\nTo be sure, not all Alito-Prelogar matchups end with ideological divisions. 
And indeed one of their earliest encounters found the two aligned, in a true theatrical situation, as the Washington-based Shakespeare Theatre Company staged a mock trial in December 2016.\n\nThe Romeo and Juliet tragedy was at the center of the mock case as it tested a wrongful death lawsuit brought by their parents (the Montagues and Capulets) against Friar Laurence, who had secretly married the young couple then helped Juliet fake her death.\n\nAlito presided as the “chief justice” with four “associate justices,” two of whom happened to be then-lower court judges Brett Kavanaugh and Ketanji Brown Jackson (eventually elevated to the real Supreme Court).\n\nPrelogar defended the Friar. Befitting the general theater audience and timing after the November 2016 election, her defense included various political and pop culture (Taylor Swift) references.\n\nOf Friar Lawrence, Prelogar said at one point, “He just wanted to make Verona great again.” Then she delivered her closing remarks in a sonnet form, “because Iambic pentameter is exceedingly persuasive.”\n\nAfter the mock panel deliberated, Alito and the others returned to the bench to deliver their ruling. He praised the lawyers on both sides of the case. “From now on,” he quipped, “I’m going to expect all the briefs from the solicitor general’s office to be in iambic pentameter.”\n\nHe then announced that Prelogar had prevailed in her defense of the Friar. Referring to the fact that the audience had separately taken its own vote, Alito joked, “Since this is a principality, it really doesn’t matter how the people voted.”\n\nPrelogar, it turned out, won that vote, too." + "text_cleaned": "Justice Samuel Alito is the tip of the spear for conservatives challenging the Biden administration during oral arguments at the Supreme Court.\n\nHe’s a fierce questioner, ready to trap advocates in their arguments. And he becomes demonstrably riled when he fails to get the answer he wants. 
He shakes his head and rolls his eyes.\n\nSolicitor General Elizabeth Prelogar is the Biden administration’s top lawyer at the court, defending the policies that are the source of much of Alito’s consternation. She responds to him with a steady pitch and precision. And she is not derailed by what he puts down.\n\nTheir jousting over issues such as abortion, vaccines, and all manner of regulatory power offers some of the most riveting exchanges heard these days at America’s high court.\n\nWhen excerpts from their Supreme Court audio appear on YouTube or other social media, it attracts thousands of views, sometimes hundreds of thousands, as court-watchers debate who got the better of the argument.\n\nTheir back-and-forth provides more than drama in the white marble setting. Prelogar, 43, argues the government’s most consequential cases. And the exchanges with Alito, 73, have the potential to influence other justices and affect whether the government wins or loses.\n\nPrelogar will be at the lectern on Tuesday in a major Second Amendment case, defending a law that prohibits persons subject to domestic violence protective orders from possessing a firearm.\n\nAlito, a Trenton, New Jersey, native, once stood at the lectern that Prelogar commands today.\n\nAfter graduating from Princeton and Yale Law School, he joined the Department of Justice and spent about five years in the early 1980s in the solicitor general’s office. President Ronald Reagan named Alito a US attorney in New Jersey and President George H.W. Bush tapped him for a prestigious appellate court post in 1990. President George W. Bush elevated Alito to the high court in January 2006, to succeed the retiring Justice Sandra Day O’Connor.\n\nPrelogar, who grew up in Boise, Idaho, graduated from Emory University and then Harvard Law School. As a teen she entered state pageants and won the Miss Idaho title in 2004. 
She said she used the pageant scholarship money for law school.\n\n“If you want to look at a through-line here, I like to go in front of judges,” Prelogar said recently about the experience on NPR’s “Wait, Wait … Don’t Tell Me!” The early pageant work may also contribute to her ease at the courtroom lectern and economy of language, shed of the usual “ums” and “ahs” that plague many lawyers.\n\nPrelogar first became familiar with the inner workings of the Supreme Court as a law clerk to liberal Justices Ruth Bader Ginsburg and Elena Kagan. From 2014 to 2019, Prelogar was an assistant to the solicitor general arguing less prominent cases before the justices, and was separately detailed to an investigation into Russian interference in the 2016 presidential election, under special counsel Robert Mueller. In 2021, President Joe Biden nominated her to be US solicitor general, and the Senate confirmed Prelogar by a vote of 53-36. Only six Republicans joined Democrats to approve her.\n\nUnprecedented, but to what end?\n\nSupreme Court oral arguments always begin with some element of suspense: How effectively will lawyers at the lectern make their case and what will justices reveal of their own views? The justices sometimes use these public sessions to press their own positions, with statements cloaked as questions, essentially beginning their negotiations with colleagues.\n\nAny lawyer arguing a progressive position, as Prelogar regularly does, faces an uphill climb, because of the court’s conservative supermajority. For about a half century, the court was generally 5-4, conservative-liberal. Since 2020, it has been 6-3 conservative-liberal.\n\nA few weeks ago, Alito and Prelogar sparred over a federal agency established to protect consumers from risky mortgages, auto loans and credit card deals.\n\nThe case began when payday lenders challenged the constitutionality of Consumer Financial Protection Bureau’s funding structure. 
Congress – seeking to ensure the agency’s independence – required it to be financed annually through the Federal Reserve System (which itself is funded through bank fees), rather than regular congressional appropriations.\n\nIt was a spirited argument, although the intensity in the courtroom as Alito challenged Prelogar’s justifications may be lost a bit in the cool words from a transcript.\n\n“There have been agencies funded this way for every year of this nation’s history,” Prelogar told the justices, as she defended the bureau established after the country’s 2008 financial crisis.\n\n“What is your best historic, your single best example, of an agency that has all of the features that the CFPB has …” Alito asked in one of his series of queries.\n\n“I think our best example historically is the Customs Service,” Prelogar responded. “The first Congress created the Customs Service in 1789. It gave the Customs Service a standing, uncapped source of funding from the revenues that the Customs Service collected.”\n\nNot satisfied, Alito continued: “What’s your best example of an agency that draws its money from another agency that, in turn, does not get its money from a congressional appropriation in the normal sense of that term but gets it from the private sector?”\n\n“I can’t give you another example of a source that’s precisely like that one,” Prelogar said, “but I would dispute the premise that that could possibly be constitutionally relevant. This is a case about Congress’s own prerogatives over the purse, its authority.”\n\n“So,” Alito declared, “I take it your answer is that you do not … But you think that to the extent it is unprecedented, it is unprecedented in a way that is not relevant for present purposes? Is that your answer?”\n\n“Yes, primarily,” Prelogar said, adding with a bit of cheek. “I think it would be unprecedented in a way that you could say this is the only agency that has the acronym CFPB. 
That’s obviously true also, but it doesn’t track the constitutional value.”\n\nOvercoming nerves\n\nChief Justice John Roberts, who before becoming a justice also served in the solicitor general’s office and then as a private appellate attorney, was a superb advocate himself. Known for rigorous preparation, he was clear and conversational.\n\nRoberts also acknowledged over the years how nerve-wracking it was. His hands would often shake before he stood up to argue, subdued once he grasped the sides of the lectern and began his presentation.\n\nFrom the bench, Roberts is a tough questioner of Prelogar and Biden administration policy, but without the palpable antagonism that often comes from Alito.\n\nDuring a January 2022 argument over a Biden Covid-19 vaccine requirement for federal workers, Alito began a set of questions with a tone that was alternately aggrieved and assured. He held the upper hand, as a majority of his colleagues were similarly skeptical about the reach of government power.\n\n“I don’t want to be misunderstood in making this point because I’m not saying the vaccines are unsafe. The FDA has approved them. It’s found that they’re safe. It’s said that the benefits greatly outweigh the risks. I’m not contesting that in any way. I don’t want to be misunderstood. I’m sure I will be misunderstood. I just want to emphasize I’m not making that point,” Alito said.\n\n“But,” Alito continued, as he confidently addressed Prelogar, “is it not the case that these vaccines and every other vaccine of which I’m aware and many other medications have benefits and they also have risks and that some people who are vaccinated and some people who take medication that is highly beneficial will suffer adverse consequences? Is that not true of these vaccines?”\n\n“That can be true,” Prelogar said, “but, of course, there is far, far greater risk from being unvaccinated, by orders of magnitude.”\n\n“But … there is some risk,” Alito interjected. 
“Do you dispute that?”\n\n“There can be a very minimal risk with respect to some individuals, but, again, I would emphasize that there would be no basis to think that these FDA-approved and authorized vaccines are not safe and effective. They are the single-most effective.”\n\nAlito cut her off: “No, I’m not making that point. I tried to make it as clear as I could. I’m not making that point. I’m not making that point. I’m not making that point. There is a risk, right? Has OSHA ever imposed any other safety regulation that imposes some extra risk, some different risk, on the employee?\n\nPrelogar: “I can’t think of anything else that’s precisely like this, but I think that to suggest that OSHA is precluded from using the most common, routine, safe, effective, proven strategy to fight an infectious disease at work would be a departure from how this statute should be understood.”\n\nAs the two continued, talking over each other, they challenged the transcription service based on how many broken sentences and dashes were recorded.\n\nWhen a clip of that exchange over the vaccine requirement of the Occupational Safety and Health Administration (OSHA) was posted online, more than 500,000 people viewed it, and more than 6,000 people left comments.\n\nThe administration lost by a 6-3 vote along ideological lines.\n\nAffirmative action and ‘the nation that we aspire to be’\n\nJeffrey Wall, who was a top official in the solicitor general’s office during the Trump administration, said the Biden solicitor general necessarily faces “headwinds with … a court that is more generally skeptical of government power.”\n\nSpeaking at a Practising Law Institute review in August, Wall overall praised the solicitor general’s record. 
“I think that General Prelogar should feel pretty good, all things considered, about how the term went,” he said.\n\nWall referred to the high-profile administration loss in the Harvard and University of North Carolina affirmative action cases last session, saying that “no one believes that those cases could have turned on the SG’s advocacy.”\n\nThe affirmative action arguments offered several Alito-Prelogar moments of tension. The Biden administration was backing admissions practices that considered students’ race as a factor in admissions to achieve campus diversity.\n\nPrelogar highlighted repercussions for the military if racial affirmative action was eliminated at the service academies or colleges with ROTC programs that prepare officer candidates.\n\n“What about a college that does not have a ROTC program,” Alito interjected, homing in on what he plainly saw as a specious ROTC reference and larger appeal to the military interest. “Would a plan that would be permissible at a college that has a program be impermissible at the latter, at the one that doesn’t have the ROTC program?”\n\n“We’re not asking the court to draw that distinction,” Prelogar told Alito, asserting that the government’s interest extends “more broadly to other federal agencies, to the federal government’s employment practices itself, and to having a set of leaders in our country who are trained to succeed in diverse environments.”\n\n“Well,” Alito countered, “then I don’t understand the relevance of what you’re saying about the link between college education either at a service academy or a school with an ROTC program and the needs of the military if it doesn’t matter whether the school has no ROTC program and therefore trains no officers.”\n\nPrelogar said the military’s interest in diversity was not confined to the service academies. 
“We believe deeply in the value of diversity and in universities being able to obtain the educational benefits that correlate with diversity,” she said.\n\nAlito later seized on Prelogar’s attempt to persuade the court to look at “the nation that we aspire to be”; he suggested the approach rang hollow.\n\n“For corporate America,” Prelogar had said as she concluded her arguments in the dispute, “diversity is essential to business solutions. For the medical community and scientific researchers, diversity is an essential element of innovation and delivering better health outcomes.”\n\nAlito told Prelogar it appeared she wanted to take the controversy at hand involving education and extend it to employment. “Is that right?” he asked.\n\n“No, Justice Alito,” Prelogar said. “I was trying to make the observation that the experience of students in those four years of college have effects on the course of their life.”\n\n“Then why were you talking about corporate America,” he rejoined.\n\n“Because corporate America,” she said, “like the United States military, relies on having a diverse pipeline of individuals who had the experience of learning in a diverse educational environment and who themselves reflect the diversity of the American population.”\n\nIn the end, Harvard and the University of North Carolina, along with the Biden administration, lost the dispute by a 6-3 vote along the familiar ideological lines. Roberts, who wrote for the majority opinion that was signed by Alito and the other justices on the right wing, added a footnote that said the decision did not apply to West Point and the other military academies. (Students for Fair Admissions, the group that started the Harvard and University of North Carolina lawsuits, recently filed cases against the service academies.)\n\nIn fair Verona\n\nTo be sure, not all Alito-Prelogar matchups end with ideological divisions. 
And indeed one of their earliest encounters found the two aligned, in a true theatrical situation, as the Washington-based Shakespeare Theatre Company staged a mock trial in December 2016.\n\nThe Romeo and Juliet tragedy was at the center of the mock case as it tested a wrongful death lawsuit brought by their parents (the Montagues and Capulets) against Friar Laurence, who had secretly married the young couple then helped Juliet fake her death.\n\nAlito presided as the “chief justice” with four “associate justices,” two of whom happened to be then-lower court judges Brett Kavanaugh and Ketanji Brown Jackson (eventually elevated to the real Supreme Court).\n\nPrelogar defended the Friar. Befitting the general theater audience and timing after the November 2016 election, her defense included various political and pop culture (Taylor Swift) references.\n\nOf Friar Lawrence, Prelogar said at one point, “He just wanted to make Verona great again.” Then she delivered her closing remarks in a sonnet form, “because Iambic pentameter is exceedingly persuasive.”\n\nAfter the mock panel deliberated, Alito and the others returned to the bench to deliver their ruling. He praised the lawyers on both sides of the case. “From now on,” he quipped, “I’m going to expect all the briefs from the solicitor general’s office to be in iambic pentameter.”\n\nHe then announced that Prelogar had prevailed in her defense of the Friar. Referring to the fact that the audience had separately taken its own vote, Alito joked, “Since this is a principality, it really doesn’t matter how the people voted.”\n\nPrelogar, it turned out, won that vote, too." 
} \ No newline at end of file diff --git a/tests/data/metadata/cnn_002.json b/tests/data/metadata/cnn_002.json old mode 100755 new mode 100644 index b9aa4e5..085c7cb --- a/tests/data/metadata/cnn_002.json +++ b/tests/data/metadata/cnn_002.json @@ -28,40 +28,40 @@ "movies": [], "keywords": [ "takeaways", - "fraud", "donald", "contentious", - "trumps", + "trump’s", "trial", - "testimony", "civil", + "testimony", + "fraud", "trump", + "wallace", "attorney", - "statements", "judge", "engoron", - "wallace", + "statements", "president", - "million", - "loans", - "loan", - "hes", "case", - "york", - "worth", - "tower", - "political", "kise", - "generals", - "general", + "million", + "he’s", + "worth", + "loan", + "loans", "witness", - "statement", "stand", - "rhetoric", + "general", + "general’s", + "political", "questioning", - "presidents", - "net", - "maralago" + "it’s", + "tower", + "statement", + "rhetoric", + "york", + "president’s", + "asked" ], "meta_keywords": [ "" @@ -74,12 +74,12 @@ "Douglas Wood" ], "publish_date": "2023-11-06T00:00:00", - "summary": "Trump’s attacks in a vacuum weren’t particularly remarkable – he’s used the same barbs throughout the course of the trial, which he’s attended repeatedly as a spectator.\nTrump’s rhetoric prompts an angry response from the judge Engoron tried at the outset of Trump’s testimony to stop the former president from making speeches and instead answer the questions, but it did little to change Trump’s approach.\n“I am not here to hear what he has to say,” Engoron said, raising his voice and telling Trump attorney Alina Habba and Kise to sit down.\nAfter a morning break, Engoron took on a more passive role in policing Trump’s statements.\n“Are you aware that the Trump Chicago loan was paid off last week?” Wallace asked Trump.", + "summary": "The high-stakes civil case strikes at the heart of Trump’s brand – his real estate empire.\nTrump’s attacks in a vacuum weren’t particularly remarkable – he’s used the same barbs 
throughout the course of the trial, which he’s attended repeatedly as a spectator.\nTrump’s rhetoric prompts an angry response from the judge Engoron tried at the outset of Trump’s testimony to stop the former president from making speeches and instead answer the questions, but it did little to change Trump’s approach.\n“I am not here to hear what he has to say,” Engoron said, raising his voice and telling Trump attorney Alina Habba and Kise to sit down.\nAfter a morning break, Engoron took on a more passive role in policing Trump’s statements.", "meta_description": "Donald Trump brought bombastic rhetoric to the witness stand Monday in the civil fraud case against him and his business, as he spent his time on the stand attacking the New York attorney general who brought the case and the judge overseeing the trial itself.", "meta_lang": "en", "meta_favicon": "/media/sites/cnn/apple-touch-icon.png", "meta_site_name": "CNN", "canonical_link": "https://www.cnn.com/2023/11/06/politics/takeaways-trump-engoron-testimony-fraud-trial/index.html", "text": "Donald Trump brought bombastic rhetoric to the witness stand Monday in the civil fraud case against him and his business, as he spent his time on the stand attacking the New York attorney general who brought the case and the judge overseeing the trial itself.\n\nTrump’s testimony at times mimicked his appearances on the campaign trail, where the former president has made the four criminal cases against him – along with the New York attorney general’s civil fraud case – a central part of his argument to be elected president again in 2024.\n\nJudge Arthur Engoron, who has clashed with Trump throughout the trial, at first tried to stop the former president’s political barbs and speechifying, telling his lawyer Chris Kise to “control your client” and threatening to have Trump removed as a witness.\n\nEventually, the judge stropped trying to control Trump – he and the attorney general’s lawyer questioning Trump let him rant, 
and then mostly disregarded the missives.\n\nThe high-stakes civil case strikes at the heart of Trump’s brand – his real estate empire. New York Attorney General Letitia James is suing Trump for $250 million and seeking to bar him from doing business in the state. Engoron has already ruled Trump and his co-defendants were liable for fraud.\n\nHere are the takeaways from Trump’s day on the stand:\n\nThe campaign comes to the courtroom\n\nThe former president’s rhetoric at times during his testimony might as well have been at one of his rallies in front of supporters. He went after the attorney general. The judge. And the “political witch hunt” that he’s been railing against for years now.\n\n“This is a political witch hunt and I think she should be ashamed of herself,” Trump said of James.\n\nTrump’s attacks in a vacuum weren’t particularly remarkable – he’s used the same barbs throughout the course of the trial, which he’s attended repeatedly as a spectator.\n\nBut on the witness stand, the charged rhetoric was even more remarkable, as he attacked the judge sitting right next to him, with James in the courtroom watching his testimony just feet away.\n\n“The fraud is on the court, not on me,” Trump said.\n\n“It’s a terrible thing you’ve done,” he said to the judge. “You believe this political hack back there and that’s unfortunate.”\n\nKevin Wallace, the lawyer with the attorney general’s office who questioned Trump, tried to pin down the former president. 
But Trump, as is his speaking style, found ways to intertwine asides and attacks into his responses even when he was answering the question.\n\nAfter one particularly long monologue, Wallace asked Trump: “Done?”\n\n“Done,” Trump said.\n\nTrump’s rhetoric prompts an angry response from the judge\n\nEngoron tried at the outset of Trump’s testimony to stop the former president from making speeches and instead answer the questions, but it did little to change Trump’s approach.\n\nThe judge responded by threatening to remove Trump from the witness stand, though that didn’t deter the former president either.\n\n“This is not a political rally,” Engoron said to Trump, telling Kise to “control your client.”\n\nKise responded by arguing to Engoron to let Trump speak. At one point, Kise called his client’s answers “brilliant.”\n\n“The court needs to hear what he has to say about these statements,” Kise said. “He’s describing to you about why there was no intent to mislead anyone with his answers. That’s what he’s doing.”\n\nEngoron did not agree.\n\n“I am not here to hear what he has to say,” Engoron said, raising his voice and telling Trump attorney Alina Habba and Kise to sit down. “We are here to hear him answer questions, and most of the time he’s not.”\n\nBefore the testimony could resume, Trump weighed in. “”This is a very unfair trial. Very, very and I hope the public is watching,” he said.\n\nAfter a morning break, Engoron took on a more passive role in policing Trump’s statements. 
He told Wallace that he was following his lead on Trump’s answers, “if you want to let the witness ramble on, nonresponsive, repeat himself.”\n\nOf course, antagonizing the judge will only get Trump so far in the trial: The civil action is a non-jury trial, so Engoron will decide the outcome.\n\nTrump acknowledges changing valuation of Trump Tower triplex\n\nThe attorney general’s office pressed Trump on the properties central to his identity and brand: Mar-a-Lago, Trump Tower and other key parts of his real estate empire.\n\nWallace pressed Trump to acknowledge the differing values in his statements of financial condition, the financial documents that have been ruled to have fraudulently inflated the former president’s net worth to obtain better loan rates. An expert for the attorney general determined the Trump Organization saved $168 million in ill-gotten gains.\n\nWallace pressed Trump on why valuations of properties were changed, such as his Trump Tower triplex, which was devalued on his financial statement in 2017 after a Forbes article found he had dramatically exaggerated the size of the apartment.\n\nTrump did acknowledge in a noteworthy exchange that there were mistakes in the financial statements, such as the Trump Tower apartment valuation.\n\nThe value of the apartment fell from $327 million in 2016 to roughly $116.8 million in 2017 – which came after Forbes Magazine outed Trump in 2017 for claiming the apartment was more than 30,000 square feet when it turned out to be just under 11,000 square feet. 
Wallace asked Trump whether he was involved in the change.\n\n“Probably,” Trump said, before giving several possible explanations.\n\nHe acknowledged there could have “been a mistake” but said that’s why his statements included disclaimer clauses and that banks are responsible for their own due diligence.\n\n“There’s a disclaimer clause where you don’t have to get sued by the attorney general of New York,” Trump said.\n\n“If I wanted to build up the statement like you said I did before you found out just how rich we are, I would’ve added brand value here and I would’ve increased it by tens of millions of dollars,” Trump said at another point in the questioning.\n\nTrump’s beautiful, expensive properties\n\nThe former president’s rhetorical flourishes went beyond attacking those who are investigating him. He also took the opportunity to play salesman and play up his properties.\n\nOne of his chief complaints about the judge is a citation in his decision that Mar-a-Lago was worth $18 million, a number based on Florida tax appraisal records.\n\n“It’s much more valuable,” Trump said of Mar-a-Lago, “and we’ll show that in two weeks or five weeks or nine weeks or whenever this thing goes, that it’s biggest value is using it as a club.”\n\nWallace took the answer to pin him down on that valuation. “You believe that as of today Mar-a-Lago is worth $1.5 billion?” Wallace asked.\n\n“I think between a billion and a billion-five,” Trump responded.\n\nWhen Trump was questioned about his golf resort in Aberdeen, Scotland, the former president was less interested in explaining why there were discrepancies in the number of housing units he had intended to build on his financial statement than he was talking up his piece of land.\n\n“I think it’s the greatest golf course ever built,” Trump said. 
“It’s one of the greatest pieces of land I’ve ever seen.”\n\nTrump was responsible for the loans at issue in AG’s suit\n\nWallace spent the final hour of questioning Trump focused on multiple loans the Trump Organization received from Deutsche Bank, which hit at the heart of the civil case against the former president.\n\nWallace had Trump confirm that he signed each of the documents and acknowledge that they all included clauses requiring annual financial statements that were accurate and that he maintain $50 million in unencumbered cash and a $2.5 billion net worth.\n\nTrump acknowledged the conditions on the loan, but argued his net worth is much higher than his statements ever showed and that the loans had been paid off.\n\n“This loan was paid off in full, with no default, with no problem and the bank was thrilled. They got all their money back,” Trump said. “The bank liked me very much.”\n\nWallace concluded the line of questioning by asking Trump to confirm that he did not believe his financial statements inflated his net worth. Trump said they were “very good” before launching into another attack that prompted Engoron to quip the questioning sounded like a “broken record.”\n\nThe attorney general’s office spent the time going through the various loans with Trump – and the question on his financial statement – because the loans are a key part of the case. The AG’s complaint alleges Trump defrauded the bank by providing the fraudulent, inflated financial statements to obtain the loans and to maintain them in the years after the initial transaction.\n\nTrump also let his looseness with the facts get in the way at one point. When the former president was talking about paying back the loan for his Chicago property – Trump International Hotel & Tower – he said the loan was “long since gone.”\n\nThe answer prompted Wallace to perk up. 
“Are you aware that the Trump Chicago loan was paid off last week?” Wallace asked Trump.\n\n“I don’t know last week, but I know recently. On time, on schedule,” Trump replied, ignoring the contradiction of his previous statement.\n\nThis story has been updated with additional developments.", - "text_cleaned": "Video Ad Feedback\n\nTrump speaks to reporters after testimony in civil fraud trial\n\n03:29 - Source: CNN\n\nPolitics of the Day 15 videos\n\nVideo Ad Feedback\n\nTrump speaks to reporters after testimony in civil fraud trial\n\n03:29\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nChristie says any Republican who is overconfident about beating Biden is 'foolish'\n\n01:16\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nHear Democratic incumbent Gov. Andy Beshear speak after reelection\n\n00:57\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nCollins presses Ramaswamy on Ohio abortion vote\n\n03:47\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\n'Shocking': Van Jones responds to new Trump-Biden poll\n\n01:26\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nHaberman makes prediction on Ivanka Trump's tactic for testimony\n\n01:00\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nHere's what is on the ballot across the US on Election Day 2023\n\n03:04\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nHow the abortion debate takes center stage in this small community\n\n04:25\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nCNN legal analyst explains key term in Trump civil trial\n\n01:22\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\n'Is this the right thing to do?': Axelrod poses questions for Biden around reelection\n\n02:32\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nWatch what Trump did when asked a question outside the courtroom\n\n02:12\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nMarjorie Taylor Greene: Republican voters are sick and tired of GOP\n\n01:14\n\nNow 
playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nBernie Sanders responds to ad accusing Biden of supporting 'genocide' in Gaza\n\n03:04\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\nChris Christie to crowd: 'Your anger against the truth is reprehensible'\n\n02:27\n\nNow playing\n\n- Source: CNN\n\nVideo Ad Feedback\n\n'Why would you trust Putin?': CNN anchor presses presidential candidate\n\n02:18\n\nNow playing\n\n- Source: CNN\n\nNew York CNN —\n\nDonald Trump brought bombastic rhetoric to the witness stand Monday in the civil fraud case against him and his business, as he spent his time on the stand attacking the New York attorney general who brought the case and the judge overseeing the trial itself.\n\nTrump’s testimony at times mimicked his appearances on the campaign trail, where the former president has made the four criminal cases against him – along with the New York attorney general’s civil fraud case – a central part of his argument to be elected president again in 2024.\n\nJudge Arthur Engoron, who has clashed with Trump throughout the trial, at first tried to stop the former president’s political barbs and speechifying, telling his lawyer Chris Kise to “control your client” and threatening to have Trump removed as a witness.\n\nEventually, the judge stropped trying to control Trump – he and the attorney general’s lawyer questioning Trump let him rant, and then mostly disregarded the missives.\n\nThe high-stakes civil case strikes at the heart of Trump’s brand – his real estate empire. New York Attorney General Letitia James is suing Trump for $250 million and seeking to bar him from doing business in the state. 
Engoron has already ruled Trump and his co-defendants were liable for fraud.\n\nVideo Ad Feedback\n\nWatch what Trump did when asked a question outside the courtroom\n\n02:12 - Source: CNN\n\nHere are the takeaways from Trump’s day on the stand:\n\nThe campaign comes to the courtroom\n\nThe former president’s rhetoric at times during his testimony might as well have been at one of his rallies in front of supporters. He went after the attorney general. The judge. And the “political witch hunt” that he’s been railing against for years now.\n\n“This is a political witch hunt and I think she should be ashamed of herself,” Trump said of James.\n\nTrump’s attacks in a vacuum weren’t particularly remarkable – he’s used the same barbs throughout the course of the trial, which he’s attended repeatedly as a spectator.\n\nBut on the witness stand, the charged rhetoric was even more remarkable, as he attacked the judge sitting right next to him, with James in the courtroom watching his testimony just feet away.\n\n“The fraud is on the court, not on me,” Trump said.\n\n“It’s a terrible thing you’ve done,” he said to the judge. “You believe this political hack back there and that’s unfortunate.”\n\nKevin Wallace, the lawyer with the attorney general’s office who questioned Trump, tried to pin down the former president. 
But Trump, as is his speaking style, found ways to intertwine asides and attacks into his responses even when he was answering the question.\n\nAfter one particularly long monologue, Wallace asked Trump: “Done?”\n\n“Done,” Trump said.\n\nTrump’s rhetoric prompts an angry response from the judge\n\nEngoron tried at the outset of Trump’s testimony to stop the former president from making speeches and instead answer the questions, but it did little to change Trump’s approach.\n\nThe judge responded by threatening to remove Trump from the witness stand, though that didn’t deter the former president either.\n\n“This is not a political rally,” Engoron said to Trump, telling Kise to “control your client.”\n\nKise responded by arguing to Engoron to let Trump speak. At one point, Kise called his client’s answers “brilliant.”\n\n“The court needs to hear what he has to say about these statements,” Kise said. “He’s describing to you about why there was no intent to mislead anyone with his answers. That’s what he’s doing.”\n\nEngoron did not agree.\n\n“I am not here to hear what he has to say,” Engoron said, raising his voice and telling Trump attorney Alina Habba and Kise to sit down. “We are here to hear him answer questions, and most of the time he’s not.”\n\nBefore the testimony could resume, Trump weighed in. “”This is a very unfair trial. Very, very and I hope the public is watching,” he said.\n\nAfter a morning break, Engoron took on a more passive role in policing Trump’s statements. 
He told Wallace that he was following his lead on Trump’s answers, “if you want to let the witness ramble on, nonresponsive, repeat himself.”\n\nOf course, antagonizing the judge will only get Trump so far in the trial: The civil action is a non-jury trial, so Engoron will decide the outcome.\n\nVideo Ad Feedback\n\n'Crash and burn': Elie Honig reacts to Trump's courtroom strategy\n\n01:22 - Source: CNN\n\nTrump acknowledges changing valuation of Trump Tower triplex\n\nThe attorney general’s office pressed Trump on the properties central to his identity and brand: Mar-a-Lago, Trump Tower and other key parts of his real estate empire.\n\nWallace pressed Trump to acknowledge the differing values in his statements of financial condition, the financial documents that have been ruled to have fraudulently inflated the former president’s net worth to obtain better loan rates. An expert for the attorney general determined the Trump Organization saved $168 million in ill-gotten gains.\n\nWallace pressed Trump on why valuations of properties were changed, such as his Trump Tower triplex, which was devalued on his financial statement in 2017 after a Forbes article found he had dramatically exaggerated the size of the apartment.\n\nTrump did acknowledge in a noteworthy exchange that there were mistakes in the financial statements, such as the Trump Tower apartment valuation.\n\nThe value of the apartment fell from $327 million in 2016 to roughly $116.8 million in 2017 – which came after Forbes Magazine outed Trump in 2017 for claiming the apartment was more than 30,000 square feet when it turned out to be just under 11,000 square feet. 
Wallace asked Trump whether he was involved in the change.\n\n“Probably,” Trump said, before giving several possible explanations.\n\nHe acknowledged there could have “been a mistake” but said that’s why his statements included disclaimer clauses and that banks are responsible for their own due diligence.\n\n“There’s a disclaimer clause where you don’t have to get sued by the attorney general of New York,” Trump said.\n\n“If I wanted to build up the statement like you said I did before you found out just how rich we are, I would’ve added brand value here and I would’ve increased it by tens of millions of dollars,” Trump said at another point in the questioning.\n\nTrump’s beautiful, expensive properties\n\nThe former president’s rhetorical flourishes went beyond attacking those who are investigating him. He also took the opportunity to play salesman and play up his properties.\n\nOne of his chief complaints about the judge is a citation in his decision that Mar-a-Lago was worth $18 million, a number based on Florida tax appraisal records.\n\n“It’s much more valuable,” Trump said of Mar-a-Lago, “and we’ll show that in two weeks or five weeks or nine weeks or whenever this thing goes, that it’s biggest value is using it as a club.”\n\nWallace took the answer to pin him down on that valuation. “You believe that as of today Mar-a-Lago is worth $1.5 billion?” Wallace asked.\n\n“I think between a billion and a billion-five,” Trump responded.\n\nWhen Trump was questioned about his golf resort in Aberdeen, Scotland, the former president was less interested in explaining why there were discrepancies in the number of housing units he had intended to build on his financial statement than he was talking up his piece of land.\n\n“I think it’s the greatest golf course ever built,” Trump said. 
“It’s one of the greatest pieces of land I’ve ever seen.”\n\nTrump was responsible for the loans at issue in AG’s suit\n\nWallace spent the final hour of questioning Trump focused on multiple loans the Trump Organization received from Deutsche Bank, which hit at the heart of the civil case against the former president.\n\nWallace had Trump confirm that he signed each of the documents and acknowledge that they all included clauses requiring annual financial statements that were accurate and that he maintain $50 million in unencumbered cash and a $2.5 billion net worth.\n\nTrump acknowledged the conditions on the loan, but argued his net worth is much higher than his statements ever showed and that the loans had been paid off.\n\n“This loan was paid off in full, with no default, with no problem and the bank was thrilled. They got all their money back,” Trump said. “The bank liked me very much.”\n\nWallace concluded the line of questioning by asking Trump to confirm that he did not believe his financial statements inflated his net worth. Trump said they were “very good” before launching into another attack that prompted Engoron to quip the questioning sounded like a “broken record.”\n\nThe attorney general’s office spent the time going through the various loans with Trump – and the question on his financial statement – because the loans are a key part of the case. The AG’s complaint alleges Trump defrauded the bank by providing the fraudulent, inflated financial statements to obtain the loans and to maintain them in the years after the initial transaction.\n\nTrump also let his looseness with the facts get in the way at one point. When the former president was talking about paying back the loan for his Chicago property – Trump International Hotel & Tower – he said the loan was “long since gone.”\n\nThe answer prompted Wallace to perk up. 
“Are you aware that the Trump Chicago loan was paid off last week?” Wallace asked Trump.\n\n“I don’t know last week, but I know recently. On time, on schedule,” Trump replied, ignoring the contradiction of his previous statement.\n\nThis story has been updated with additional developments." + "text_cleaned": "Donald Trump brought bombastic rhetoric to the witness stand Monday in the civil fraud case against him and his business, as he spent his time on the stand attacking the New York attorney general who brought the case and the judge overseeing the trial itself.\n\nTrump’s testimony at times mimicked his appearances on the campaign trail, where the former president has made the four criminal cases against him – along with the New York attorney general’s civil fraud case – a central part of his argument to be elected president again in 2024.\n\nJudge Arthur Engoron, who has clashed with Trump throughout the trial, at first tried to stop the former president’s political barbs and speechifying, telling his lawyer Chris Kise to “control your client” and threatening to have Trump removed as a witness.\n\nEventually, the judge stropped trying to control Trump – he and the attorney general’s lawyer questioning Trump let him rant, and then mostly disregarded the missives.\n\nThe high-stakes civil case strikes at the heart of Trump’s brand – his real estate empire. New York Attorney General Letitia James is suing Trump for $250 million and seeking to bar him from doing business in the state. Engoron has already ruled Trump and his co-defendants were liable for fraud.\n\nHere are the takeaways from Trump’s day on the stand:\n\nThe campaign comes to the courtroom\n\nThe former president’s rhetoric at times during his testimony might as well have been at one of his rallies in front of supporters. He went after the attorney general. The judge. 
And the “political witch hunt” that he’s been railing against for years now.\n\n“This is a political witch hunt and I think she should be ashamed of herself,” Trump said of James.\n\nTrump’s attacks in a vacuum weren’t particularly remarkable – he’s used the same barbs throughout the course of the trial, which he’s attended repeatedly as a spectator.\n\nBut on the witness stand, the charged rhetoric was even more remarkable, as he attacked the judge sitting right next to him, with James in the courtroom watching his testimony just feet away.\n\n“The fraud is on the court, not on me,” Trump said.\n\n“It’s a terrible thing you’ve done,” he said to the judge. “You believe this political hack back there and that’s unfortunate.”\n\nKevin Wallace, the lawyer with the attorney general’s office who questioned Trump, tried to pin down the former president. But Trump, as is his speaking style, found ways to intertwine asides and attacks into his responses even when he was answering the question.\n\nAfter one particularly long monologue, Wallace asked Trump: “Done?”\n\n“Done,” Trump said.\n\nTrump’s rhetoric prompts an angry response from the judge\n\nEngoron tried at the outset of Trump’s testimony to stop the former president from making speeches and instead answer the questions, but it did little to change Trump’s approach.\n\nThe judge responded by threatening to remove Trump from the witness stand, though that didn’t deter the former president either.\n\n“This is not a political rally,” Engoron said to Trump, telling Kise to “control your client.”\n\nKise responded by arguing to Engoron to let Trump speak. At one point, Kise called his client’s answers “brilliant.”\n\n“The court needs to hear what he has to say about these statements,” Kise said. “He’s describing to you about why there was no intent to mislead anyone with his answers. 
That’s what he’s doing.”\n\nEngoron did not agree.\n\n“I am not here to hear what he has to say,” Engoron said, raising his voice and telling Trump attorney Alina Habba and Kise to sit down. “We are here to hear him answer questions, and most of the time he’s not.”\n\nBefore the testimony could resume, Trump weighed in. “”This is a very unfair trial. Very, very and I hope the public is watching,” he said.\n\nAfter a morning break, Engoron took on a more passive role in policing Trump’s statements. He told Wallace that he was following his lead on Trump’s answers, “if you want to let the witness ramble on, nonresponsive, repeat himself.”\n\nOf course, antagonizing the judge will only get Trump so far in the trial: The civil action is a non-jury trial, so Engoron will decide the outcome.\n\nTrump acknowledges changing valuation of Trump Tower triplex\n\nThe attorney general’s office pressed Trump on the properties central to his identity and brand: Mar-a-Lago, Trump Tower and other key parts of his real estate empire.\n\nWallace pressed Trump to acknowledge the differing values in his statements of financial condition, the financial documents that have been ruled to have fraudulently inflated the former president’s net worth to obtain better loan rates. 
An expert for the attorney general determined the Trump Organization saved $168 million in ill-gotten gains.\n\nWallace pressed Trump on why valuations of properties were changed, such as his Trump Tower triplex, which was devalued on his financial statement in 2017 after a Forbes article found he had dramatically exaggerated the size of the apartment.\n\nTrump did acknowledge in a noteworthy exchange that there were mistakes in the financial statements, such as the Trump Tower apartment valuation.\n\nThe value of the apartment fell from $327 million in 2016 to roughly $116.8 million in 2017 – which came after Forbes Magazine outed Trump in 2017 for claiming the apartment was more than 30,000 square feet when it turned out to be just under 11,000 square feet. Wallace asked Trump whether he was involved in the change.\n\n“Probably,” Trump said, before giving several possible explanations.\n\nHe acknowledged there could have “been a mistake” but said that’s why his statements included disclaimer clauses and that banks are responsible for their own due diligence.\n\n“There’s a disclaimer clause where you don’t have to get sued by the attorney general of New York,” Trump said.\n\n“If I wanted to build up the statement like you said I did before you found out just how rich we are, I would’ve added brand value here and I would’ve increased it by tens of millions of dollars,” Trump said at another point in the questioning.\n\nTrump’s beautiful, expensive properties\n\nThe former president’s rhetorical flourishes went beyond attacking those who are investigating him. 
He also took the opportunity to play salesman and play up his properties.\n\nOne of his chief complaints about the judge is a citation in his decision that Mar-a-Lago was worth $18 million, a number based on Florida tax appraisal records.\n\n“It’s much more valuable,” Trump said of Mar-a-Lago, “and we’ll show that in two weeks or five weeks or nine weeks or whenever this thing goes, that it’s biggest value is using it as a club.”\n\nWallace took the answer to pin him down on that valuation. “You believe that as of today Mar-a-Lago is worth $1.5 billion?” Wallace asked.\n\n“I think between a billion and a billion-five,” Trump responded.\n\nWhen Trump was questioned about his golf resort in Aberdeen, Scotland, the former president was less interested in explaining why there were discrepancies in the number of housing units he had intended to build on his financial statement than he was talking up his piece of land.\n\n“I think it’s the greatest golf course ever built,” Trump said. “It’s one of the greatest pieces of land I’ve ever seen.”\n\nTrump was responsible for the loans at issue in AG’s suit\n\nWallace spent the final hour of questioning Trump focused on multiple loans the Trump Organization received from Deutsche Bank, which hit at the heart of the civil case against the former president.\n\nWallace had Trump confirm that he signed each of the documents and acknowledge that they all included clauses requiring annual financial statements that were accurate and that he maintain $50 million in unencumbered cash and a $2.5 billion net worth.\n\nTrump acknowledged the conditions on the loan, but argued his net worth is much higher than his statements ever showed and that the loans had been paid off.\n\n“This loan was paid off in full, with no default, with no problem and the bank was thrilled. They got all their money back,” Trump said. 
“The bank liked me very much.”\n\nWallace concluded the line of questioning by asking Trump to confirm that he did not believe his financial statements inflated his net worth. Trump said they were “very good” before launching into another attack that prompted Engoron to quip the questioning sounded like a “broken record.”\n\nThe attorney general’s office spent the time going through the various loans with Trump – and the question on his financial statement – because the loans are a key part of the case. The AG’s complaint alleges Trump defrauded the bank by providing the fraudulent, inflated financial statements to obtain the loans and to maintain them in the years after the initial transaction.\n\nTrump also let his looseness with the facts get in the way at one point. When the former president was talking about paying back the loan for his Chicago property – Trump International Hotel & Tower – he said the loan was “long since gone.”\n\nThe answer prompted Wallace to perk up. “Are you aware that the Trump Chicago loan was paid off last week?” Wallace asked Trump.\n\n“I don’t know last week, but I know recently. On time, on schedule,” Trump replied, ignoring the contradiction of his previous statement.\n\nThis story has been updated with additional developments." 
} \ No newline at end of file diff --git a/tests/data/metadata/cnn_article.json b/tests/data/metadata/cnn_article.json new file mode 100644 index 0000000..efe3fda --- /dev/null +++ b/tests/data/metadata/cnn_article.json @@ -0,0 +1,78 @@ +{ + "url": "https://edition.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html", + "read_more_link": "", + "language": "en", + "title": "After storm, forecasters see smooth sailing for Thanksgiving", + "top_image": "http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg", + "meta_img": "http://i2.cdn.turner.com/cnn/dam/assets/131129200805-01-weather-1128-story-top.jpg", + "images": [ + "http://i.cdn.turner.com/cnn/images/1.gif", + "http://i.cdn.turner.com/cnn/.element/img/3.0/global/header/intl/hdr-globe-central.gif", + "http://i.cdn.turner.com/cnn/.e/img/3.0/global/misc/advertisement.gif", + "http://i.cdn.turner.com/cnn/.element/img/3.0/1px.gif", + "http://i.cdn.turner.com/cnn/.e/img/3.0/global/misc/advertisement.gif", + "http://z.cdn.turner.com/cnn/.element/img/3.0/global/footer/pngs/footer_google.png", + "http://z.cdn.turner.com/cnn/.e/img/3.0/global/footer/pngs/footer_cnn_logo.png", + "http://z.cdn.turner.com/cnn/.e/img/3.0/global/misc/logo_ad_choices_footer.png" + ], + "movies": [], + "keywords": [ + "forecasters", + "smooth", + "sailing", + "storm", + "thanksgiving", + "weather", + "balloons", + "travel", + "delays", + "snow", + "good", + "flight", + "roads", + "winds", + "cnn", + "great", + "airport", + "york", + "parade", + "hit", + "pennsylvania", + "department", + "transportation", + "night", + "worst", + "holiday", + "day", + "northeast", + "e-mail", + "m", + "forecast", + "air", + "balloon", + "reported", + "cancellations" + ], + "meta_keywords": [ + "winter storm", + "holiday travel", + "Thanksgiving storm", + "Thanksgiving winter storm" + ], + "tags": null, + "authors": [ + "Dana A. Ford", + "James S.A. 
Corey", + "Chien-Ming Wang", + "Tom Watkins" + ], + "publish_date": "2013-11-27T00:00:00", + "summary": "The storm caused some complications and inconveniences, but no major delays or breakdowns.\nForecasters see mostly smooth sailing into Thanksgiving.\nThat's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend.\n\"I was second in line checking my bag with Delta (checked into my flight last night) and security was a breeze.\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade.", + "meta_description": "A strong storm struck much of the eastern United States on Wednesday, complicating holiday plans for many of the 43 million Americans expected to travel.", + "meta_lang": "en", + "meta_favicon": "http://i.cdn.turner.com/cnn/.e/img/3.0/global/misc/apple-touch-icon.png", + "meta_site_name": "CNN", + "canonical_link": "http://www.cnn.com/2013/11/27/travel/weather-thanksgiving/index.html", + "text": "Are you in the grips of the wintry storm sweeping across the country? Please share your videos, pictures and stories at iReport.\n\n(CNN) -- The Pennsylvania official was just talking about one area, but he summed up a winter storm that struck much of the eastern United States on Wednesday.\n\n\"We dodged a bullet,\" said Steve Cowan, a spokesman for the Pennsylvania Department of Transportation.\n\nAs night fell, travelers and transportation authorities breathed a collective sigh of relief as the worst of holiday travel fears failed to materialize. The storm caused some complications and inconveniences, but no major delays or breakdowns.\n\nForecasters see mostly smooth sailing into Thanksgiving.\n\nCNN meteorologist Todd Borek predicted some lake-effect snowfall over the Great Lakes and fresh snow from a weak disturbance for the Upper Midwest.\n\nBut, he added: \"The worst in terms of widespread snow and rain is over. 
Tomorrow will be a quieter day, although wind gusts will continue to be a problem overnight and Thursday for the Northeast.\"\n\nThat's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend.\n\nAbney said Wednesday that she arrived more than two hours early at Washington's Reagan National Airport to catch a flight to New York's JFK.\n\n\"I thought the lines were going to be ridiculous,\" she said in an e-mail. \"I was second in line checking my bag with Delta (checked into my flight last night) and security was a breeze. I walked right up, the TSA agent checked my info and I immediately started the security process. And now the wait begins. It's pretty quiet by the gates. Not too many people roaming around. So far, so good! Happy Thanksgiving!!!!\"\n\nBrian M. Good said he, too, was expecting a horrible trip when he departed New York for Newark to get a flight to San Diego.\n\n\"Instead the roads were dead,\" he said in an e-mail. \"It's warm outside and it stopped raining. No lines at the airport and flight is on time. Wish the forecasters were wrong all the time :)\"\n\nWill winds whip parade balloons?\n\nThough the worst of the storm has passed, winds could still pose a problem.\n\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade. They are to be grounded if sustained winds reach 23 mph or gusts exceed 34 mph -- both slightly above predicted strength.\n\nA decision will be made Thursday morning before the parade's 9 a.m. ET scheduled start.\n\n\"Tomorrow before the event, we'll make a determination -- the police department, the incident commander -- whether the balloons will fly or not,\" Patrol Chief James Hall with the New York Police Department said Wednesday.\n\n\"It looks good. 
It looks very good,\" he said about the possibility of balloons in the air.\n\nBut there is ample reason to support the caution.\n\nIn 1997, a woman spent more than three weeks in a coma after the Cat in the Hat balloon -- tossed by heavy winds -- struck a pole that hit her. In 2005, two other people were hurt in a similar incident involving the M&Ms balloon.\n\nOfficials say that improved weather monitoring devices en route and a police sergeant assigned to each balloon will minimize any danger.\n\n\"We came all the way from Puerto Rico to see the parade, so it will be a disappointment if we don't see the balloons,\" said Jose Ramirez, who was in New York with his family.\n\nEither way, the parade -- with or without the balloons -- will go on, organizers say.\n\nPlanes, trains ...\n\nDelays were reported at several airports in the Northeast on Wednesday.\n\nDon't get trapped by bad Thanksgiving weather: Top tips\n\nHeavy winds at all three New York-area airports -- LaGuardia, John F. Kennedy and Newark-Liberty -- resulted in delays of 30 minutes to an hour, according to the Federal Aviation Administration. Departure delays from Philadelphia International Airport averaged nearly two hours.\n\nAmerican Airlines' Kent Powell reported three cancellations, none of them related to the weather.\n\nAt US Airways, Todd Lehmacher called the impact of the weather \"pretty minimal, at this point,\" with six weather-related cancellations and an on-time performance of about 67%. The impact was more pronounced at US Airways Express, which tallied 56 cancellations because of the weather, he said.\n\n\"The real story is people are getting to their destinations, albeit a little delayed in some but not all cases,\" he said.\n\n#ATL24: A day in the life of the world's busiest airport\n\nAmtrak reported no major delays systemwide. 
Using the weather as a marketing tool, the nation's rail system was adding seats on some routes.\n\n\"Rail travel remains one of the most reliable and comfortable transportation options, especially in weather conditions that negatively impact other modes,\" Amtrak said.\n\nThere may be something to that.\n\n\"QUIET CAR. Window seat. Polite seatmate. I have hit the Amtrak travel trifecta. #blessed,\" Ellie Hall tweeted early Wednesday.\n\n5 healthy eating tips for holiday travelers\n\n... and automobiles\n\nSnow blanketed parts of the Midwest, where crews scrambled to clear roads. The storm was blamed for scores of accidents.\n\nUp to a foot of snow fell in parts of Pennsylvania, and it was falling from upstate New York into Canada, where more than a foot was possible. Snow also continued to fly in the central Appalachians and around the Great Lakes as cold air moved in and produced lake-effect snows.\n\nRoad conditions were not great in much of the Northeast.\n\n\"It's sleet; it's rain; it's 31 degrees. It's ugly out there,\" CNN meteorologist Chad Myers said.\n\nStill, a call for anecdotes elicited a number of comments, but no horror stories.\n\n\"Well, since the forecast said we would be hitting an ice storm on our way, we ended up leaving the night before we had planned and took an alternate route,\" Sarah Martini said in an e-mail. \"This morning we took smaller roads to avoid DC traffic and have made great progress! We managed to avoid bad weather and traffic and are getting close to getting to Pittsburgh from NC!\"\n\nThe National Highway Traffic Safety Administration said that 416 motorists died during Thanksgiving weekend last year, that 60% of the dead had not been wearing seat belts and that 42% of the accidents involved a drunken driver.\n\nLast week, 12 people died, most of them in car crashes, when one of the fronts making up the current storm iced roads from the Rockies to Texas and Oklahoma. 
More than 100 vehicles ended up in wrecks.\n\n\"I get on the highway, and the next thing I know I'm spinning,\" said Seqret Watson, among the dozens of drivers in Northwest Arkansas sent sliding when their cars hit icy bridges and roads.\n\n\"I try to grab my wheel and then I just hit the wall. Just jumped out to make sure my kids were OK,\" Watson told affiliate KFSM.\n\nThe Peterson family had initially planned to drive from Northern Virginia to Massachusetts. But after seeing the forecast, they booked seats on a flight at the last minute.\n\n\"It was a small fortune,\" Jennifer Peterson told CNN affiliate WUSA. \"We could've gone to the Bahamas for what we paid!\"\n\nStorm prompts airlines to relax travel policies\n\nCNN's Jason Carroll, Ben Brumfield, Dave Hennen, Aaron Cooper, Alexandra Field, Shannon Travis and Greg Botelho contributed to this report.", + "text_cleaned": "Are you in the grips of the wintry storm sweeping across the country? Please share your videos, pictures and stories at iReport.\n\n(CNN) -- The Pennsylvania official was just talking about one area, but he summed up a winter storm that struck much of the eastern United States on Wednesday.\n\n\"We dodged a bullet,\" said Steve Cowan, a spokesman for the Pennsylvania Department of Transportation.\n\nAs night fell, travelers and transportation authorities breathed a collective sigh of relief as the worst of holiday travel fears failed to materialize. The storm caused some complications and inconveniences, but no major delays or breakdowns.\n\nForecasters see mostly smooth sailing into Thanksgiving.\n\nCNN meteorologist Todd Borek predicted some lake-effect snowfall over the Great Lakes and fresh snow from a weak disturbance for the Upper Midwest.\n\nBut, he added: \"The worst in terms of widespread snow and rain is over. 
Tomorrow will be a quieter day, although wind gusts will continue to be a problem overnight and Thursday for the Northeast.\"\n\nThat's good news for people like Latasha Abney, who joined the more than 43 million Americans expected by AAA to travel over the Thanksgiving holiday weekend.\n\nAbney said Wednesday that she arrived more than two hours early at Washington's Reagan National Airport to catch a flight to New York's JFK.\n\n\"I thought the lines were going to be ridiculous,\" she said in an e-mail. \"I was second in line checking my bag with Delta (checked into my flight last night) and security was a breeze. I walked right up, the TSA agent checked my info and I immediately started the security process. And now the wait begins. It's pretty quiet by the gates. Not too many people roaming around. So far, so good! Happy Thanksgiving!!!!\"\n\nBrian M. Good said he, too, was expecting a horrible trip when he departed New York for Newark to get a flight to San Diego.\n\n\"Instead the roads were dead,\" he said in an e-mail. \"It's warm outside and it stopped raining. No lines at the airport and flight is on time. Wish the forecasters were wrong all the time :)\"\n\nWill winds whip parade balloons?\n\nThough the worst of the storm has passed, winds could still pose a problem.\n\nThe forecast has left up in the air the fate of the balloons in Macy's Thanksgiving Day Parade. They are to be grounded if sustained winds reach 23 mph or gusts exceed 34 mph -- both slightly above predicted strength.\n\nA decision will be made Thursday morning before the parade's 9 a.m. ET scheduled start.\n\n\"Tomorrow before the event, we'll make a determination -- the police department, the incident commander -- whether the balloons will fly or not,\" Patrol Chief James Hall with the New York Police Department said Wednesday.\n\n\"It looks good. 
It looks very good,\" he said about the possibility of balloons in the air.\n\nBut there is ample reason to support the caution.\n\nIn 1997, a woman spent more than three weeks in a coma after the Cat in the Hat balloon -- tossed by heavy winds -- struck a pole that hit her. In 2005, two other people were hurt in a similar incident involving the M&Ms balloon.\n\nOfficials say that improved weather monitoring devices en route and a police sergeant assigned to each balloon will minimize any danger.\n\n\"We came all the way from Puerto Rico to see the parade, so it will be a disappointment if we don't see the balloons,\" said Jose Ramirez, who was in New York with his family.\n\nEither way, the parade -- with or without the balloons -- will go on, organizers say.\n\nPlanes, trains ...\n\nDelays were reported at several airports in the Northeast on Wednesday.\n\nDon't get trapped by bad Thanksgiving weather: Top tips\n\nHeavy winds at all three New York-area airports -- LaGuardia, John F. Kennedy and Newark-Liberty -- resulted in delays of 30 minutes to an hour, according to the Federal Aviation Administration. Departure delays from Philadelphia International Airport averaged nearly two hours.\n\nAmerican Airlines' Kent Powell reported three cancellations, none of them related to the weather.\n\nAt US Airways, Todd Lehmacher called the impact of the weather \"pretty minimal, at this point,\" with six weather-related cancellations and an on-time performance of about 67%. The impact was more pronounced at US Airways Express, which tallied 56 cancellations because of the weather, he said.\n\n\"The real story is people are getting to their destinations, albeit a little delayed in some but not all cases,\" he said.\n\n#ATL24: A day in the life of the world's busiest airport\n\nAmtrak reported no major delays systemwide. 
Using the weather as a marketing tool, the nation's rail system was adding seats on some routes.\n\n\"Rail travel remains one of the most reliable and comfortable transportation options, especially in weather conditions that negatively impact other modes,\" Amtrak said.\n\nThere may be something to that.\n\n\"QUIET CAR. Window seat. Polite seatmate. I have hit the Amtrak travel trifecta. #blessed,\" Ellie Hall tweeted early Wednesday.\n\n5 healthy eating tips for holiday travelers\n\n... and automobiles\n\nSnow blanketed parts of the Midwest, where crews scrambled to clear roads. The storm was blamed for scores of accidents.\n\nUp to a foot of snow fell in parts of Pennsylvania, and it was falling from upstate New York into Canada, where more than a foot was possible. Snow also continued to fly in the central Appalachians and around the Great Lakes as cold air moved in and produced lake-effect snows.\n\nRoad conditions were not great in much of the Northeast.\n\n\"It's sleet; it's rain; it's 31 degrees. It's ugly out there,\" CNN meteorologist Chad Myers said.\n\nStill, a call for anecdotes elicited a number of comments, but no horror stories.\n\n\"Well, since the forecast said we would be hitting an ice storm on our way, we ended up leaving the night before we had planned and took an alternate route,\" Sarah Martini said in an e-mail. \"This morning we took smaller roads to avoid DC traffic and have made great progress! We managed to avoid bad weather and traffic and are getting close to getting to Pittsburgh from NC!\"\n\nThe National Highway Traffic Safety Administration said that 416 motorists died during Thanksgiving weekend last year, that 60% of the dead had not been wearing seat belts and that 42% of the accidents involved a drunken driver.\n\nLast week, 12 people died, most of them in car crashes, when one of the fronts making up the current storm iced roads from the Rockies to Texas and Oklahoma. 
More than 100 vehicles ended up in wrecks.\n\n\"I get on the highway, and the next thing I know I'm spinning,\" said Seqret Watson, among the dozens of drivers in Northwest Arkansas sent sliding when their cars hit icy bridges and roads.\n\n\"I try to grab my wheel and then I just hit the wall. Just jumped out to make sure my kids were OK,\" Watson told affiliate KFSM.\n\nThe Peterson family had initially planned to drive from Northern Virginia to Massachusetts. But after seeing the forecast, they booked seats on a flight at the last minute.\n\n\"It was a small fortune,\" Jennifer Peterson told CNN affiliate WUSA. \"We could've gone to the Bahamas for what we paid!\"\n\nStorm prompts airlines to relax travel policies\n\nCNN's Jason Carroll, Ben Brumfield, Dave Hennen, Aaron Cooper, Alexandra Field, Shannon Travis and Greg Botelho contributed to this report." +} \ No newline at end of file diff --git a/tests/data/metadata/fox13now_001.json b/tests/data/metadata/fox13now_001.json old mode 100755 new mode 100644 index ea33d20..b5a7fb8 --- a/tests/data/metadata/fox13now_001.json +++ b/tests/data/metadata/fox13now_001.json @@ -1,37 +1,68 @@ { + "url": "https://www.fox13now.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones", + "read_more_link": "", + "language": "en", "title": "New Year, new laws: Obamacare, pot, guns and drones", - "top_img": "https://www.fox13now.com/apple-touch-icon.png", - "meta_img": "https://www.fox13now.com/apple-touch-icon.png", - "imgs": "{'https://ewscripps.brightspotcdn.com/dims4/default/aa09ace/2147483647/strip/true/crop/340x133+0+0/resize/340x133!/quality/90/?url=http%3A%2F%2Fewscripps-brightspot.s3.amazonaws.com%2F24%2Fa4%2F8e8db4e3481aa736296f1eda4ee7%2Fmain-logo.png', 'https://www.fox13now.com/apple-touch-icon.png', 'https://ewscripps.brightspotcdn.com/0b/0c/0184c2f44e4783b067c6ee93fcc0/fox13webad.jpg', 'https://assets.scrippsdigital.com/cms/images/logo-scripps.png', 'https://www.fox13now.com/styleguide/assets/Blank.gif'}", + 
"top_image": "https://ewscripps.brightspotcdn.com/0b/0c/0184c2f44e4783b067c6ee93fcc0/fox13webad.jpg", + "meta_img": "https://www.fox13now.com/favicon-32x32.png", + "images": [ + "https://ewscripps.brightspotcdn.com/dims4/default/aa09ace/2147483647/strip/true/crop/340x133+0+0/resize/340x133!/quality/90/?url=http%3A%2F%2Fewscripps-brightspot.s3.amazonaws.com%2F24%2Fa4%2F8e8db4e3481aa736296f1eda4ee7%2Fmain-logo.png", + "https://ewscripps.brightspotcdn.com/dims4/default/aa09ace/2147483647/strip/true/crop/340x133+0+0/resize/340x133!/quality/90/?url=http%3A%2F%2Fewscripps-brightspot.s3.amazonaws.com%2F24%2Fa4%2F8e8db4e3481aa736296f1eda4ee7%2Fmain-logo.png", + "https://www.fox13now.com/styleguide/assets/Blank.gif", + "https://ewscripps.brightspotcdn.com/0b/0c/0184c2f44e4783b067c6ee93fcc0/fox13webad.jpg" + ], "movies": [], "keywords": [ - "latest", - "states", - "family", + "pot", "guns", - "minimum", "laws", - "national", - "law", - "state", - "wage", + "drones", "obamacare", + "wage", + "minimum", + "state", + "illinois", + "california", + "family", + "connecticut", + "national", + "care", "leave", - "pot", - "drones" + "oregon", + "law", + "latest", + "gender", + "workers", + "hour", + "job", + "americans", + "marijuana", + "colorado", + "legislatures", + "federal", + "health", + "vending", + "machines", + "receive", + "provide", + "shark", + "fins", + "school" ], "meta_keywords": [ "" ], - "tags": "set()", + "tags": null, "authors": [ - "Cnn Wire" + "CNN Wire" ], - "publish_date": "2013-12-30 00:00:00", - "summary": "Oregon: Family leave in Oregon has been expanded to allow eligible employees two weeks of paid leave to handle the death of a family member.\nArkansas: The state becomes the latest state requiring voters show a picture ID at the voting booth.\nMinimum wage and former felon employment Workers in 13 states and four cities will see increases to the minimum wage.\nNew Jersey residents voted to raise the state’s minimum wage by $1 to $8.25 per hour.\nCalifornia 
is also raising its minimum wage to $9 per hour, but workers must wait until July to see the addition.", + "publish_date": "2013-12-30T00:00:00", + "summary": "The previous law had a minimum wage requirement.\nIllinois and drones Illinois: passed two laws limiting the use of drones.\nMinimum wage and former felon employment Workers in 13 states and four cities will see increases to the minimum wage.\nNew Jersey residents voted to raise the state’s minimum wage by $1 to $8.25 per hour.\nCalifornia is also raising its minimum wage to $9 per hour, but workers must wait until July to see the addition.", "meta_description": "By Leigh Ann Caldwell CNN WASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014. Some 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibiti", "meta_lang": "en", "meta_favicon": "/apple-touch-icon.png", "meta_site_name": "FOX 13 News Utah (KSTU)", - "canonical_link": "https://localtvkstu.wordpress.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/" + "canonical_link": "https://localtvkstu.wordpress.com/2013/12/30/new-year-new-laws-obamacare-pot-guns-and-drones/", + "text": "WASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014.\n\nSome 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibition in Illinois and transgender protections in California.\n\nAlthough many new laws are controversial, they made it through legislatures, public referendum or city councils and represent the shifting composition of American beliefs.\n\nFederal: Health care, of course, and vending machines\n\nThe biggest and most politically charged change comes at the federal level with the imposition of a new fee for those adults without health 
insurance.\n\nFor 2014, the penalty is either $95 per adult or 1% of family income, whichever results in a larger fine.\n\nThe Obamacare, of Affordable Care Act, mandate also requires that insurers cover immunizations and some preventive care.\n\nAdditionally, millions of poor Americans will receive Medicaid benefits starting January 1.\n\nThousands of companies will have to provide calorie counts for products sold in vending machines.\n\nLocal: Guns, family leave and shark fins\n\nConnecticut: While no national legislation was approved to tighten gun laws a year after the Newtown school shooting, Connecticut is implementing a final round of changes to its books: All assault weapons and large capacity magazines must be registered.\n\nOregon: Family leave in Oregon has been expanded to allow eligible employees two weeks of paid leave to handle the death of a family member.\n\nCalifornia: Homeless youth are eligible to receive food stamps. The previous law had a minimum wage requirement.\n\nDelaware: Delaware is the latest in a growing number of states where residents can no longer possess, sell or distribute shark fins, which is considered a delicacy in some East Asian cuisine.\n\nIllinois and drones\n\nIllinois: passed two laws limiting the use of drones. One prohibits them from interfering with hunters and fisherman. The measure passed after the group People for the Ethical Treatment of Animals said it would use drones to monitor hunters. 
PETA said it aims through its “air angels” effort to protect against “cruel” and “illegal” hunting.\n\nAlso in Illinois, another law prohibits the use of drones for law enforcement without a warrant.\n\nGender and voting identity\n\nCalifornia: Students can use bathrooms and join school athletic teams “consistent with their gender identity,” even if it’s different than their gender at birth.\n\nArkansas: The state becomes the latest state requiring voters show a picture ID at the voting booth.\n\nMinimum wage and former felon employment\n\nWorkers in 13 states and four cities will see increases to the minimum wage.\n\nWhile most amount to less than 15 cents per hour, workers in places like New Jersey, and Connecticut.\n\nNew Jersey residents voted to raise the state’s minimum wage by $1 to $8.25 per hour. And in Connecticut, lawmakers voted to raise it between 25 and 75 cents to $8.70. The wage would go up to $8 in Rhode Island and New York.\n\nCalifornia is also raising its minimum wage to $9 per hour, but workers must wait until July to see the addition.\n\nRhode Island: It is the latest state to prohibit employers from requiring job applicants to signify if they have a criminal record on a job application.\n\nSocial media and pot\n\nOregon: Employers and schools can’t require a job or student applicant to provide passwords to social media accounts.\n\nColorado: Marijuana becomes legal in the state for buyers over 21 at a licensed retail dispensary.\n\n(Sourcing: much of this list was obtained from the National Conference of State Legislatures).", + "text_cleaned": "WASHINGTON (CNN) — Not everyone subscribes to a New Year’s resolution, but Americans will be required to follow new laws in 2014.\n\nSome 40,000 measures taking effect range from sweeping, national mandates under Obamacare to marijuana legalization in Colorado, drone prohibition in Illinois and transgender protections in California.\n\nAlthough many new laws are controversial, they made it through 
legislatures, public referendum or city councils and represent the shifting composition of American beliefs.\n\nFederal: Health care, of course, and vending machines\n\nThe biggest and most politically charged change comes at the federal level with the imposition of a new fee for those adults without health insurance.\n\nFor 2014, the penalty is either $95 per adult or 1% of family income, whichever results in a larger fine.\n\nThe Obamacare, of Affordable Care Act, mandate also requires that insurers cover immunizations and some preventive care.\n\nAdditionally, millions of poor Americans will receive Medicaid benefits starting January 1.\n\nThousands of companies will have to provide calorie counts for products sold in vending machines.\n\nLocal: Guns, family leave and shark fins\n\nConnecticut: While no national legislation was approved to tighten gun laws a year after the Newtown school shooting, Connecticut is implementing a final round of changes to its books: All assault weapons and large capacity magazines must be registered.\n\nOregon: Family leave in Oregon has been expanded to allow eligible employees two weeks of paid leave to handle the death of a family member.\n\nCalifornia: Homeless youth are eligible to receive food stamps. The previous law had a minimum wage requirement.\n\nDelaware: Delaware is the latest in a growing number of states where residents can no longer possess, sell or distribute shark fins, which is considered a delicacy in some East Asian cuisine.\n\nIllinois and drones\n\nIllinois: passed two laws limiting the use of drones. One prohibits them from interfering with hunters and fisherman. The measure passed after the group People for the Ethical Treatment of Animals said it would use drones to monitor hunters. 
PETA said it aims through its “air angels” effort to protect against “cruel” and “illegal” hunting.\n\nAlso in Illinois, another law prohibits the use of drones for law enforcement without a warrant.\n\nGender and voting identity\n\nCalifornia: Students can use bathrooms and join school athletic teams “consistent with their gender identity,” even if it’s different than their gender at birth.\n\nArkansas: The state becomes the latest state requiring voters show a picture ID at the voting booth.\n\nMinimum wage and former felon employment\n\nWorkers in 13 states and four cities will see increases to the minimum wage.\n\nWhile most amount to less than 15 cents per hour, workers in places like New Jersey, and Connecticut.\n\nNew Jersey residents voted to raise the state’s minimum wage by $1 to $8.25 per hour. And in Connecticut, lawmakers voted to raise it between 25 and 75 cents to $8.70. The wage would go up to $8 in Rhode Island and New York.\n\nCalifornia is also raising its minimum wage to $9 per hour, but workers must wait until July to see the addition.\n\nRhode Island: It is the latest state to prohibit employers from requiring job applicants to signify if they have a criminal record on a job application.\n\nSocial media and pot\n\nOregon: Employers and schools can’t require a job or student applicant to provide passwords to social media accounts.\n\nColorado: Marijuana becomes legal in the state for buyers over 21 at a licensed retail dispensary.\n\n(Sourcing: much of this list was obtained from the National Conference of State Legislatures)." 
} \ No newline at end of file diff --git a/tests/data/metadata/japanese_article.json b/tests/data/metadata/japanese_article.json new file mode 100644 index 0000000..43327fd --- /dev/null +++ b/tests/data/metadata/japanese_article.json @@ -0,0 +1,104 @@ +{ + "url": "https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/", + "read_more_link": "", + "language": "ja", + "title": "米朝首脳直通電話 習氏に挑むトランプ戦術の成否 (写真=ロイター) :日本経済新聞", + "top_image": "https://www.nikkei.com/content/pic/20180619/96958A9F889DE1E3EAEBE5E4E4E2E3EAE2E4E0E2E3EAE2E2E2E2E2E2-DSXMZO3190512018062018000001-PB1-2.jpg", + "meta_img": "https://www.nikkei.com/content/pic/20180619/96958A9F889DE1E3EAEBE5E4E4E2E3EAE2E4E0E2E3EAE2E2E2E2E2E2-DSXMZO3190512018062018000001-PB1-2.jpg", + "images": [ + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/logo_nikkei_header_2017.svg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/icon_paper.svg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/icon_nkd.svg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/icon_jinji.svg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/icon_mynews.svg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/btn_localmenu_pulldown_close_r1_off.gif", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/btn_localmenu_pulldown_close_r1_off.gif", + "https://www.nikkei.com/content/pic/20170509/96958A88889DE3E4E2EBE6E4E0E2E2EAE2E7E0E2E3E5E2E2E2E2E2E0-DSXZZO1609463008052017000002-NONE-2.png", + "https://www.nikkei.com/content/pic/20110203/96958A88889DE1E3E7EBEBE2EAE2E3E3E2E4E0E2E3EAE2E2E2E2E2E5-DSXZZO3159909011062018000007-NONE-1.png", + "https://www.nikkei.com/content/pic/20110823/96958A88889DE1E6E2E3E2E4EBE2E3E4E2EAE0E2E3E3E2E2E2E2E4E7-DSXZZO1850683005072017000002-NONE-1.png", + "https://www.nikkei.com/content/pic/20110511/96958A88889DE0EAE1E2E5E2E0E2E3E3E2E7E0E2E3E3E2E2E2E2E3E1-DSXZZO1850690005072017000004-NONE-2.png", + 
"https://www.nikkei.com/content/pic/20110203/96958A88889DE4EBE5E0E6E0EBE2E3E3E2E6E0E2E3E6E2E2E2E2E6E3-DSXZZO7818739009102014000010-NONE-28.png", + "https://www.nikkei.com/content/pic/20171107/96958A88889DE1E3E3E1EAE2EAE2E0E5E2E4E0E2E3E3E2E2E2E2E2E4-DSXZZO2302779002112017000000-NONE-7.png", + "https://www.nikkei.com/content/pic/20170509/96958A88889DE3E4E2EBE6E4E0E2E2EAE2E7E0E2E3E5E2E2E2E2E2E6-DSXZZO1609463008052017000004-NONE-3.png", + "https://www.nikkei.com/content/pic/20110203/96958A88889DE1E3E7EBEBE2EAE2E3E3E2E4E0E2E3EAE2E2E2E2E3E0-DSXZZO3159909011062018000012-NONE-1.png", + "https://www.nikkei.com/content/pic/20150515/96958A88889DEAE4EAE0E7EBE1E2E3E7E2E7E0E2E3E7E2E2E2E2E2E1-DSXZZO8682594015052015000003-NONE-5.png", + "https://www.nikkei.com/content/pic/20150515/96958A88889DEAE4EAE0E7E5E5E2E3E7E2E7E0E2E3E7E2E2E2E2E2E4-DSXZZO8682578015052015000006-NONE-2.png", + "https://www.nikkei.com/content/pic/20150513/96958A88889DEAE4E5E3E1E2E0E2E3E1E2E7E0E2E3E7E2E2E2E2E2EB-DSXZZO8671303013052015000009-NONE-1.png", + "https://www.nikkei.com/content/pic/20150515/96958A88889DEAE4E5EAE1E0E4E2E3E6E2E7E0E2E3E7E2E2E2E2E0E1-DSXZZO2878299030032018000000-NONE-1.png", + "https://www.nikkei.com/content/pic/20150514/96958A88889DEAE4E5E5EBE3E7E2E3E6E2E7E0E2E3E7E2E2E2E2E2EB-DSXZZO2652967005022018000003-NONE-1.jpg", + "https://www.nikkei.com/content/pic/20150707/96958A88889DEAEAEBE7E3E3E2E2E2E4E2E5E0E2E3E7E2E2E2E2E2E2-DSXZZO8895118006072015000000-NONE-2.jpg", + "https://www.nikkei.com/content/pic/20171208/96958A88889DEAE4E5EAE1E0E4E2E3E6E2E7E0E2E3E7E2E2E2E2E3EB-DSXZZO2441528008122017000000-NONE-2.jpg", + "https://www.nikkei.com/content/pic/20180401/96958A88889DE0EAE4E1EAEAEBE2E0E5E2E1E0E2E3EAE2E2E2E2E2E2-DSXZZO2863931027032018000002-NONE-1.jpg", + "https://www.nikkei.com/content/pic/20160523/96958A88889DE3E4EBE7EAE7E1E2E0E6E3E2E0E2E3E2E2E2E2E2E1E3-DSXZZO2985848026042018000000-PS1-1.jpg", + 
"https://www.nikkei.com/content/pic/20101024/96958A88889DE3E4EBE7EAE7E1E2E0E6E3E2E0E2E3E2E2E2E2E2E2EB-DSXZZO1982335012122010000009-PS1-15.jpg", + "https://assets.nikkei.jp/release/v3.1.23/parts/ds/images/common/logo_nikkei_footer.svg" + ], + "movies": [], + "keywords": [ + " ", + "脳直", + "通電", + "話", + "習氏", + "挑む", + "戦術", + "成否", + "写真", + "=", + "ロイター", + " :", + "日本経済新聞", + "米", + "、", + "トランプ", + "(", + ")", + "朝首", + "。", + "的", + "談", + "巡っ", + "金正", + "恩", + "・", + "中国", + "習近", + "平", + "「", + "」", + "6月", + "1", + "2", + "日" + ], + "meta_keywords": [ + "北朝鮮政府", + "中朝関係", + "米国政府", + "習近平", + "金正恩", + "温家宝", + "トランプ", + "蔡英文", + "李克強", + "父の日", + "マリーナベイ・サンズ", + "米朝ホットライン開設", + "中国", + "ワシントン", + "中国国際航空" + ], + "tags": null, + "authors": [ + "日本経済新聞社" + ], + "publish_date": "2018-06-19T05:50:00+09:00", + "summary": "6月12日の歴史的な米朝首脳会談を巡って戦っていたのは、米大統領のトランプと北朝鮮委員長の金正恩(キム・ジョンウン)ばかりではなかった。中国国家主席、習近平(シー・ジンピン)も重要な陰の参戦者だった。核放棄を巡って「検証可能な」の文字がなく、米韓軍事演習の中止にまで踏み込んだ米朝会談は、トランプによる金正恩への一方的譲歩であり、後ろ盾だった中国の習近平も「勝ち組」になったかに見える。 その判断は…", + "meta_description": "6月12日の歴史的な米朝首脳会談を巡って戦っていたのは、米大統領のトランプと北朝鮮委員長の金正恩(キム・ジョンウン)ばかりではなかった。中国国家主席、習近平(シー・ジンピン)も重要な陰の参戦者だった", + "meta_lang": "ja", + "meta_favicon": "//assets.nikkei.jp/release/v3.1.23/parts/ds/images/ico/pin_favicon.ico", + "meta_site_name": "日本経済新聞 電子版", + "canonical_link": "https://www.nikkei.com/article/DGXMZO31897660Y8A610C1000000/", + "text": "6月12日の歴史的な米朝首脳会談を巡って戦っていたのは、米大統領のトランプと北朝鮮委員長の金正恩(キム・ジョンウン)ばかりではなかった。中国国家主席、習近平(シー・ジンピン)も重要な陰の参戦者だった。核放棄を巡って「検証可能な」の文字がなく、米韓軍事演習の中止にまで踏み込んだ米朝会談は、トランプによる金正恩への一方的譲歩であり、後ろ盾だった中国の習近平も「勝ち組」になったかに見える。\n\nその判断は…", + "text_cleaned": "6月12日の歴史的な米朝首脳会談を巡って戦っていたのは、米大統領のトランプと北朝鮮委員長の金正恩(キム・ジョンウン)ばかりではなかった。中国国家主席、習近平(シー・ジンピン)も重要な陰の参戦者だった。核放棄を巡って「検証可能な」の文字がなく、米韓軍事演習の中止にまで踏み込んだ米朝会談は、トランプによる金正恩への一方的譲歩であり、後ろ盾だった中国の習近平も「勝ち組」になったかに見える。\n\nその判断は…" +} \ No newline at end of file diff --git a/tests/data/metadata/japanese_article2.json b/tests/data/metadata/japanese_article2.json new file mode 100644 index 
0000000..eb837f1 --- /dev/null +++ b/tests/data/metadata/japanese_article2.json @@ -0,0 +1,69 @@ +{ + "url": "http://www.afpbb.com/articles/-/3178894", + "read_more_link": "", + "language": "ja", + "title": "動画:ナイジェリアで少女らによる自爆攻撃、ロケット弾も撃ち込まれ31人死亡", + "top_image": "http://afpbb.ismcdn.jp/mwimgs/7/8/1000x/img_78b12987efed76066134014edcd4ba45104529.jpg", + "meta_img": "http://afpbb.ismcdn.jp/mwimgs/7/8/1000x/img_78b12987efed76066134014edcd4ba45104529.jpg", + "images": [ + "http://afpbb.ismcdn.jp/mwimgs/7/4/120x100/img_748985d8f6320630ca42d044b55a8ff7102635.jpg", + "http://afpbb.ismcdn.jp/mwimgs/f/2/120x100/img_f2b1c8fd9dd1eae1993ae48b34d20729106383.jpg", + "http://afpbb.ismcdn.jp/mwimgs/8/0/120x100/img_807a2241c52290ca8c6ee3392c13300463857.jpg", + "http://afpbb.ismcdn.jp/mwimgs/0/6/120x100/img_065f69254775f4a96ad66fa4d1c15ce246628.jpg", + "http://afpbb.ismcdn.jp/mwimgs/1/e/83x60/img_1ed85a9b2f9d257ded991014e1c057665550.jpg", + "http://afpbb.ismcdn.jp/mwimgs/8/9/83x60/img_89e83142be1259d0beb330202949ef165303.jpg", + "http://afpbb.ismcdn.jp/mwimgs/6/c/83x60/img_6cf5e49935df36ff278785f504e67daa4327.jpg" + ], + "movies": [], + "keywords": [ + "動画", + ":", + "撃ち込ま", + "、", + "1", + "攻撃", + "自爆", + "人", + "少女", + "ロケット", + "弾", + "ナイジェリア", + "3", + "死亡", + "。", + "者", + "・", + "「", + "」", + "(", + ")", + "ボコ", + "ハラム", + "\n\n", + " ", + "日", + "AFP", + "6", + "地元", + "明らか", + "死傷", + "北東部", + "ボルノ", + "州", + "イスラム" + ], + "meta_keywords": [ + "" + ], + "tags": null, + "authors": [], + "publish_date": "2018-06-18T13:57:57+00:00", + "summary": "【6月18日 AFP】ナイジェリア北東部ボルノ(Borno)州で16日夜、イスラム過激派組織「ボコ・ハラム(Boko Haram)」によるものと思われる少女を使った自爆攻撃が起き、31人が死亡した。地元当局者と民兵指導者が17日、AFPに明らかにした。 攻撃はボルノ州の町ダンボア(Damboa)で、イスラム教の断食月「ラマダン(Ramadan)」明けの祭り「イード・アル・フィトル(Eid al-Fitr)」の祝いから帰る途中の人々を狙って起きたもので、ボコ・ハラムの犯行と思われる特徴があった。 数回の自爆の後、襲撃者らは自爆攻撃の現場に集まった群集の中に携行式ロケット弾を撃ち込み、死傷者がさらに増えた。 地元の民兵指導者は「昨夜ダンボアで2度の自爆攻撃とロケット弾による爆発があり、31人が死亡した。その他に数人が負傷した」と語り、ボコ・ハラムの犯行であることは明らかだと述べた。 
地元当局者は「死傷者の大半は町の外から発射されたロケット弾によるものだった。事件後に、自爆攻撃が6人の少女によって実行されたことが明らかになった。救急隊が現場で6人の頭部を発見した。顔つきからして7歳から10歳までの少女だった」と述べた。 政府はボコ・ハラムが劣勢にあると繰り返し主張しているが、あるアナリストは「ボコ・ハラムはナイジェリア北東部で大量の死傷者を出す攻撃を行う意図と作戦能力を維持している」と指摘している。(c)AFP/Aminu ABUBAKAR", + "meta_description": "【6月18日 AFP】ナイジェリア北東部ボルノ(Borno)州で16日夜、イスラム過激派組織「ボコ・ハラム(Boko Haram)」によるものと思われる少女を使った自爆攻撃が起き、31人が死亡した。", + "meta_lang": "ja", + "meta_favicon": "http://afpbb.ismcdn.jp/common/images/favicon.ico", + "meta_site_name": "", + "canonical_link": "http://www.afpbb.com/articles/-/3178894", + "text": "【6月18日 AFP】ナイジェリア北東部ボルノ(Borno)州で16日夜、イスラム過激派組織「ボコ・ハラム(Boko Haram)」によるものと思われる少女を使った自爆攻撃が起き、31人が死亡した。地元当局者と民兵指導者が17日、AFPに明らかにした。\n\n攻撃はボルノ州の町ダンボア(Damboa)で、イスラム教の断食月「ラマダン(Ramadan)」明けの祭り「イード・アル・フィトル(Eid al-Fitr)」の祝いから帰る途中の人々を狙って起きたもので、ボコ・ハラムの犯行と思われる特徴があった。\n\n数回の自爆の後、襲撃者らは自爆攻撃の現場に集まった群集の中に携行式ロケット弾を撃ち込み、死傷者がさらに増えた。\n\n地元の民兵指導者は「昨夜ダンボアで2度の自爆攻撃とロケット弾による爆発があり、31人が死亡した。その他に数人が負傷した」と語り、ボコ・ハラムの犯行であることは明らかだと述べた。\n\n地元当局者は「死傷者の大半は町の外から発射されたロケット弾によるものだった。事件後に、自爆攻撃が6人の少女によって実行されたことが明らかになった。救急隊が現場で6人の頭部を発見した。顔つきからして7歳から10歳までの少女だった」と述べた。\n\n政府はボコ・ハラムが劣勢にあると繰り返し主張しているが、あるアナリストは「ボコ・ハラムはナイジェリア北東部で大量の死傷者を出す攻撃を行う意図と作戦能力を維持している」と指摘している。(c)AFP/Aminu ABUBAKAR", + "text_cleaned": "【6月18日 AFP】ナイジェリア北東部ボルノ(Borno)州で16日夜、イスラム過激派組織「ボコ・ハラム(Boko Haram)」によるものと思われる少女を使った自爆攻撃が起き、31人が死亡した。地元当局者と民兵指導者が17日、AFPに明らかにした。\n\n攻撃はボルノ州の町ダンボア(Damboa)で、イスラム教の断食月「ラマダン(Ramadan)」明けの祭り「イード・アル・フィトル(Eid al-Fitr)」の祝いから帰る途中の人々を狙って起きたもので、ボコ・ハラムの犯行と思われる特徴があった。\n\n数回の自爆の後、襲撃者らは自爆攻撃の現場に集まった群集の中に携行式ロケット弾を撃ち込み、死傷者がさらに増えた。\n\n地元の民兵指導者は「昨夜ダンボアで2度の自爆攻撃とロケット弾による爆発があり、31人が死亡した。その他に数人が負傷した」と語り、ボコ・ハラムの犯行であることは明らかだと述べた。\n\n地元当局者は「死傷者の大半は町の外から発射されたロケット弾によるものだった。事件後に、自爆攻撃が6人の少女によって実行されたことが明らかになった。救急隊が現場で6人の頭部を発見した。顔つきからして7歳から10歳までの少女だった」と述べた。\n\n政府はボコ・ハラムが劣勢にあると繰り返し主張しているが、あるアナリストは「ボコ・ハラムはナイジェリア北東部で大量の死傷者を出す攻撃を行う意図と作戦能力を維持している」と指摘している。(c)AFP/Aminu ABUBAKAR" +} \ No newline at end of file diff --git a/tests/data/metadata/spanish_article.json b/tests/data/metadata/spanish_article.json old mode 
100755 new mode 100644 index 3c3373a..ce35bf4 --- a/tests/data/metadata/spanish_article.json +++ b/tests/data/metadata/spanish_article.json @@ -6,77 +6,48 @@ "top_image": "", "meta_img": "", "images": [ - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=15&cb=1094933012&n=ac87e5c3", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=15&cb=354397435&n=ac87e5c3", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=15&cb=994141256&n=ac87e5c3", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=15&cb=835116037&n=ac87e5c3", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=15&cb=458852135&n=ac87e5c3", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=163&cb=135733613&n=ac87e5c3", "http://uh.gsstatic.es/images/iconos/preload.gif", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/158666/width/515/height/337.png", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158666.png", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158627.png", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=102&cb=787150556&n=ac87e5c3", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158613.png", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158674.png", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158625.png", - "http://ultimahora.es/sfAttachPlugin/getPreview/id/158658.png", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=169&cb=126739943&n=ac87e5c3", - "http://ultimahora.es/images/layout/back_hoy_mallorca.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/158151/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/154270/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/156640/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/151696/width/286/height/163/crop/1.png", - 
"http://ultimahora.es/sfAttachPlugin/getCachedContent/id/148926/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/137820/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/153758/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/156074/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/158229/width/286/height/163/crop/1.png", - "http://ultimahora.es/sfAttachPlugin/getCachedCropContent/id/110226.png", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=8&cb=200970506&n=ac87e5c3", - "http://ultimahora.es/blog/portada/id/24.png", - "http://ultimahora.es/blog/portada/id/28.png", - "http://banners.evoluhcion.es/www/delivery/avw.php?zoneid=13&cb=1031302047&n=ac87e5c3" + "http://ultimahora.es/sfAttachPlugin/getCachedContent/id/158666/width/515/height/337.png" ], "movies": [ "http://www.facebook.com/plugins/like.php?href=http%3A%2F%2Fultimahora.es%2Fmallorca%2Fnoticia%2Fnoticias%2Flocal%2Ffiscalia-anticorrupcion-estudia-recurre-imputacion-infanta.html&layout=button_count&show_faces=false&width=120&action=like&font&colorscheme=light&height=21" ], "keywords": [ - "recurre", - "madrid", "decidirá", + "madrid", + "recurre", "infanta", "imputación", - "si", "anticorrupción", + "si", "castro", "auto", + "horrach", "juez", "palma", - "horrach", + "cristina", "audiencia", - "urdangarin", - "través", - "sé", + "fiscal", + "antonio", "salinas", - "rey", - "relación", - "recursos", + "imputada", + "delitos", "postura", - "pasado", - "nóos", + "hija", "menor", - "iñaki", - "instructor", + "rey", "indicios", - "imputada", + "instructor", + "caso", + "nóos", "hizo", - "hija", - "fiscal", - "delitos", - "decisión", - "cristina", - "caso" + "pasado", + "ayer", + "abril", + "anuló", + "sé", + "relación", + "decisión" ], "meta_keywords": [ "Fiscalía Anticorrupción", @@ -97,5 +68,5 @@ "meta_site_name": "", 
"canonical_link": null, "text": "El fiscal Pedro Horrach está perfilando con el jefe de la Fiscalía Anticorrupción, Antonio Salinas, en Madrid, la estrategia jurídica que seguirá tras el auto dictado por el juez José Castro en el que acuerda citar en calidad de imputada a la infanta Cristina de Borbón y Grecia el próximo 8 de marzo en Palma por los delitos de blanqueo de capitales y contra la Hacienda Pública.\n\nHorrach duda si insistir en su postura de dejar al margen a la hija menor del Rey de los indicios incriminatorios citados por el instructor del ‘caso Nóos’, como ya lo hizo por escrito el pasado mes de noviembre o, por el contrario, se limita exponer de manera sucinta su punto de vista para apoyar el recurso que ya ha anunciado el abogado de la Infanta, Miquel Roca.\n\nConsenso\n\nEl fiscal anticorrupción se entrevistó ayer con su superior, Antonio Salinas, con el objetivo de analizar el extenso auto – 227 folios– de Castro y consensuar una postura común. En principio, Horrach no está dispuesto a entrar al trapo de las argumentaciones del titular del Juzgado de Instrucción 3 de Palma, que le reprocha en su resolución que defienda a ultranza a la Infanta y no haga lo mismo, por ejemplo, con otros imputados. «Las meras conjeturas o sospechas no constituyen elementos válidos de imputación», sostenía Horrach, que ya había recurrido el pasado abril contra la primera imputación de la infanta, que anuló la Audiencia de Palma.\n\nAsimismo, el juez Castro realizó ayer por la mañana unas breves declaraciones sobre su auto ante un grupo de periodistas a su llegada a los juzgados de Vía Alemania. 
«Uno lo hace para que se valide, pero luego si se valida o no, no lo sé», dijo el instructor del ‘caso Noos’ en relación a la decisión que debe adoptar la Audiencia de Palma sobre los previsibles recursos anunciados contra la imputación de la infanta Cristina.\n\n«No lo sé», manifestó Castro, que al ser preguntado por la presentación de recursos contra la imputación, declaró: «No estoy en la intención de ellos», en alusión a las partes. En relación a la reacción de la Casa Real al auto en el que llama a declarar a doña Cristina como imputada, Castro se limitó a manifestar «respeto también».\n\nEl juez señala en el auto que hay indicios claros de que la hija menor del Rey pudo haber cometido delitos fiscales a través de la empresa Aizoon, que compartía al 50 % con su marido, Iñaki Urdangarin, una sociedad presuntamente creada para canalizar fondos obtenidos fraudulentamente a través del Instituto Nóos. Esta es la segunda ocasión en que el juez Castro imputa a la esposa de Iñaki Urdangarin. 
Ya lo hizo en abril, pero la Audiencia anuló su decisión.", - "text_cleaned": "El fiscal Pedro Horrach está perfilando con el jefe de la Fiscalía Anticorrupción, Antonio Salinas, en Madrid, la estrategia jurídica que seguirá tras el auto dictado por el juez José Castro en el que acuerda citar en calidad de imputada a la infanta Cristina de Borbón y Grecia el próximo 8 de marzo en Palma por los delitos de blanqueo de capitales y contra la Hacienda Pública.\n\nHorrach duda si insistir en su postura de dejar al margen a la hija menor del Rey de los indicios incriminatorios citados por el instructor del ‘caso Nóos’, como ya lo hizo por escrito el pasado mes de noviembre o, por el contrario, se limita exponer de manera sucinta su punto de vista para apoyar el recurso que ya ha anunciado el abogado de la Infanta, Miquel Roca.\n\nConsenso\n\nEl fiscal anticorrupción se entrevistó ayer con su superior, Antonio Salinas, con el objetivo de analizar el extenso auto – 227 folios– de Castro y consensuar una postura común. En principio, Horrach no está dispuesto a entrar al trapo de las argumentaciones del titular del Juzgado de Instrucción 3 de Palma, que le reprocha en su resolución que defienda a ultranza a la Infanta y no haga lo mismo, por ejemplo, con otros imputados. «Las meras conjeturas o sospechas no constituyen elementos válidos de imputación», sostenía Horrach, que ya había recurrido el pasado abril contra la primera imputación de la infanta, que anuló la Audiencia de Palma.\n\nAsimismo, el juez Castro realizó ayer por la mañana unas breves declaraciones sobre su auto ante un grupo de periodistas a su llegada a los juzgados de Vía Alemania. 
«Uno lo hace para que se valide, pero luego si se valida o no, no lo sé», dijo el instructor del ‘caso Noos’ en relación a la decisión que debe adoptar la Audiencia de Palma sobre los previsibles recursos anunciados contra la imputación de la infanta Cristina.\n\n«No lo sé», manifestó Castro, que al ser preguntado por la presentación de recursos contra la imputación, declaró: «No estoy en la intención de ellos», en alusión a las partes. En relación a la reacción de la Casa Real al auto en el que llama a declarar a doña Cristina como imputada, Castro se limitó a manifestar «respeto también».\n\nEl juez señala en el auto que hay indicios claros de que la hija menor del Rey pudo haber cometido delitos fiscales a través de la empresa Aizoon, que compartía al 50 % con su marido, Iñaki Urdangarin, una sociedad presuntamente creada para canalizar fondos obtenidos fraudulentamente a través del Instituto Nóos. Esta es la segunda ocasión en que el juez Castro imputa a la esposa de Iñaki Urdangarin. Ya lo hizo en abril, pero la Audiencia anuló su decisión." 
+ "text_cleaned": "Visto 314 veces\n\nEl fiscal Pedro Horrach está perfilando con el jefe de la Fiscalía Anticorrupción, Antonio Salinas, en Madrid, la estrategia jurídica que seguirá tras el auto dictado por el juez José Castro en el que acuerda citar en calidad de imputada a la infanta Cristina de Borbón y Grecia el próximo 8 de marzo en Palma por los delitos de blanqueo de capitales y contra la Hacienda Pública.\n\nHorrach duda si insistir en su postura de dejar al margen a la hija menor del Rey de los indicios incriminatorios citados por el instructor del ‘caso Nóos’, como ya lo hizo por escrito el pasado mes de noviembre o, por el contrario, se limita exponer de manera sucinta su punto de vista para apoyar el recurso que ya ha anunciado el abogado de la Infanta, Miquel Roca.\n\nConsenso\n\nEl fiscal anticorrupción se entrevistó ayer con su superior, Antonio Salinas, con el objetivo de analizar el extenso auto – 227 folios– de Castro y consensuar una postura común. En principio, Horrach no está dispuesto a entrar al trapo de las argumentaciones del titular del Juzgado de Instrucción 3 de Palma, que le reprocha en su resolución que defienda a ultranza a la Infanta y no haga lo mismo, por ejemplo, con otros imputados. «Las meras conjeturas o sospechas no constituyen elementos válidos de imputación», sostenía Horrach, que ya había recurrido el pasado abril contra la primera imputación de la infanta, que anuló la Audiencia de Palma.\n\nAsimismo, el juez Castro realizó ayer por la mañana unas breves declaraciones sobre su auto ante un grupo de periodistas a su llegada a los juzgados de Vía Alemania. 
«Uno lo hace para que se valide, pero luego si se valida o no, no lo sé», dijo el instructor del ‘caso Noos’ en relación a la decisión que debe adoptar la Audiencia de Palma sobre los previsibles recursos anunciados contra la imputación de la infanta Cristina.\n\n«No lo sé», manifestó Castro, que al ser preguntado por la presentación de recursos contra la imputación, declaró: «No estoy en la intención de ellos», en alusión a las partes. En relación a la reacción de la Casa Real al auto en el que llama a declarar a doña Cristina como imputada, Castro se limitó a manifestar «respeto también».\n\nEl juez señala en el auto que hay indicios claros de que la hija menor del Rey pudo haber cometido delitos fiscales a través de la empresa Aizoon, que compartía al 50 % con su marido, Iñaki Urdangarin, una sociedad presuntamente creada para canalizar fondos obtenidos fraudulentamente a través del Instituto Nóos. Esta es la segunda ocasión en que el juez Castro imputa a la esposa de Iñaki Urdangarin. Ya lo hizo en abril, pero la Audiencia anuló su decisión." 
} \ No newline at end of file diff --git a/tests/data/metadata/thai_article.json b/tests/data/metadata/thai_article.json new file mode 100644 index 0000000..7302a4b --- /dev/null +++ b/tests/data/metadata/thai_article.json @@ -0,0 +1,65 @@ +{ + "url": "https://prachatai.com/journal/2019/01/80642", + "read_more_link": "", + "language": "th", + "title": "ผล DNA ยืนยัน ศพลอยแม่น้ำโขงเป็นคนสนิท อ.สุรชัย", + "top_image": "https://farm8.staticflickr.com/7890/46097871864_6f2f471e68_k_d.jpg", + "meta_img": "", + "images": [ + "https://farm8.staticflickr.com/7890/46097871864_6f2f471e68_k_d.jpg", + "https://c2.staticflickr.com/2/1799/42011955610_e1a5914e8c_o.png", + "https://c1.staticflickr.com/5/4533/38536140181_90867cd984_z.jpg" + ], + "movies": [], + "keywords": [ + " ", + "ยืนยัน", + "ศพ", + "ลอย", + "แม่น้ำ", + "โขง", + "อ.", + "สุรชัย", + "DNA", + "คนสนิท", + "\n", + "นาย", + "(", + ")", + "ภู", + "ชนะ", + "'", + "ด่าน", + "หาย", + "ตรวจสอบ", + "เพื่อนบ้าน", + "ทราบ", + "กาสะ", + "ลอง", + "ปี", + "2561", + "เวลา", + "ผู้สื่อข่าว", + "ต.", + "บุตรชาย", + "แซ่", + "จน", + "ติด", + "วัฒนา", + "นุ" + ], + "meta_keywords": [ + "" + ], + "tags": null, + "authors": [], + "publish_date": null, + "summary": "21 ม.ค.2561 เวลาประมาณ 12.00 น.\nผู้สื่อข่าวได้รับแจ้งจากนาย ต.\n(ไม่ประสงค์เปิดเผยชื่อ) บุตรชายของ 'ภูชนะ' คนสนิทของนายสุรชัย แซ่ด่าน ที่หายไปว่า ผลการตรวจ DNA โดยใช้เนื้อเยื่อของศพที่ถูกสังหารด้วยการมัดแขน รัดคอ ทุบจนใบหน้าเละ และท้องถูกผ่ายัดเสาปูนที่ลอยมาติดที่ อ.ธาตุพนม จ.นครพนม นั้น เมื่อตรวจสอบแล้วมีความเกี่ยวพันทางสายเลือดกับนาย ต.จริง ภูชนะ (นามแฝง) เป็นคนใกล้ชิดของนายสุรชัย ด่านวัฒนานุสรณ์ หรือ สุรชัย แซ่ด่าน นักเคลื่อนไหวทางการเมืองที่ลี้ภัยออกจากประเทศไทยและได้หายตัวไปจากที่พักในประเทศเพื่อนบ้านในช่วงคืนวันที่ 12-13 ธ.ค.2561 พร้อมกันกับสุรชัยและคนสนิทอีกคนหนึ่ง ปรานี ด่านวัฒนานุสรณ์ ภรรยาของสุรชัยกล่าวว่า ได้ทราบข่าวจากบุตรชายของภูชนะแล้ว แต่ยังไม่ขอพูดอะไร โดยเบื้องต้นได้ทำใจแต่แรกแล้วว่าเหตุการณ์ลักษณะนี้จะต้องเกิดขึ้นสักวันหนึ่ง ขณะที่ นาย ว.\nพี่เขยของ 
'กาสะลอง' อีกหนึ่งผู้ลี้ภัยที่ได้หายไปพร้อมกับนายสุรชัยกล่าวกับผู้สื่อข่าวว่า หลังจากได้ทราบผลการตรวจสอบ DNA จากลูกชายของภูชนะ ทางญาติของกาสะลองก็ได้ติดต่อไปที่พนักงานสอบสวนเจ้าของคดีและได้คำตอบว่า จะได้ทราบผลการตรวจสอบ DNA ภายในเวลา 2-3 วันนี้ สุรชัย (78 ปี) ภูชนะ (54 ปี) กาสะลอง (47 ปี) เป็นนักเคลื่อนไหวทางการเมือง และเป็นผู้ลี้ภัยจากเหตุการณ์รัฐประหาร 2557 ไปยังประเทศเพื่อนบ้านได้หายออกจากที่พักในประเทศเพื่อนบ้านโดยที่ไม่มีใครสามารถติดต่อได้จนปัจจุบันนับเป็นเวลานาน 1 เดือน กับอีก 10 วัน", + "meta_description": "", + "meta_lang": "th", + "meta_favicon": "https://prachatai.com/sites/default/files/pct_index.jpg", + "meta_site_name": "", + "canonical_link": "https://prachatai.com/journal/2019/01/80642", + "text": "21 ม.ค.2561 เวลาประมาณ 12.00 น. ผู้สื่อข่าวได้รับแจ้งจากนาย ต.(ไม่ประสงค์เปิดเผยชื่อ) บุตรชายของ 'ภูชนะ' คนสนิทของนายสุรชัย แซ่ด่าน ที่หายไปว่า ผลการตรวจ DNA โดยใช้เนื้อเยื่อของศพที่ถูกสังหารด้วยการมัดแขน รัดคอ ทุบจนใบหน้าเละ และท้องถูกผ่ายัดเสาปูนที่ลอยมาติดที่ อ.ธาตุพนม จ.นครพนม นั้น เมื่อตรวจสอบแล้วมีความเกี่ยวพันทางสายเลือดกับนาย ต.จริง\n\nภูชนะ (นามแฝง) เป็นคนใกล้ชิดของนายสุรชัย ด่านวัฒนานุสรณ์ หรือ สุรชัย แซ่ด่าน นักเคลื่อนไหวทางการเมืองที่ลี้ภัยออกจากประเทศไทยและได้หายตัวไปจากที่พักในประเทศเพื่อนบ้านในช่วงคืนวันที่ 12-13 ธ.ค.2561 พร้อมกันกับสุรชัยและคนสนิทอีกคนหนึ่ง\n\nปรานี ด่านวัฒนานุสรณ์ ภรรยาของสุรชัยกล่าวว่า ได้ทราบข่าวจากบุตรชายของภูชนะแล้ว แต่ยังไม่ขอพูดอะไร โดยเบื้องต้นได้ทำใจแต่แรกแล้วว่าเหตุการณ์ลักษณะนี้จะต้องเกิดขึ้นสักวันหนึ่ง\n\nขณะที่ นาย ว. 
พี่เขยของ 'กาสะลอง' อีกหนึ่งผู้ลี้ภัยที่ได้หายไปพร้อมกับนายสุรชัยกล่าวกับผู้สื่อข่าวว่า หลังจากได้ทราบผลการตรวจสอบ DNA จากลูกชายของภูชนะ ทางญาติของกาสะลองก็ได้ติดต่อไปที่พนักงานสอบสวนเจ้าของคดีและได้คำตอบว่า จะได้ทราบผลการตรวจสอบ DNA ภายในเวลา 2-3 วันนี้\n\nสุรชัย (78 ปี) ภูชนะ (54 ปี) กาสะลอง (47 ปี) เป็นนักเคลื่อนไหวทางการเมือง และเป็นผู้ลี้ภัยจากเหตุการณ์รัฐประหาร 2557 ไปยังประเทศเพื่อนบ้านได้หายออกจากที่พักในประเทศเพื่อนบ้านโดยที่ไม่มีใครสามารถติดต่อได้จนปัจจุบันนับเป็นเวลานาน 1 เดือน กับอีก 10 วัน", + "text_cleaned": "21 ม.ค.2561 เวลาประมาณ 12.00 น. ผู้สื่อข่าวได้รับแจ้งจากนาย ต.(ไม่ประสงค์เปิดเผยชื่อ) บุตรชายของ 'ภูชนะ' คนสนิทของนายสุรชัย แซ่ด่าน ที่หายไปว่า ผลการตรวจ DNA โดยใช้เนื้อเยื่อของศพที่ถูกสังหารด้วยการมัดแขน รัดคอ ทุบจนใบหน้าเละ และท้องถูกผ่ายัดเสาปูนที่ลอยมาติดที่ อ.ธาตุพนม จ.นครพนม นั้น เมื่อตรวจสอบแล้วมีความเกี่ยวพันทางสายเลือดกับนาย ต.จริง\n\nภูชนะ (นามแฝง) เป็นคนใกล้ชิดของนายสุรชัย ด่านวัฒนานุสรณ์ หรือ สุรชัย แซ่ด่าน นักเคลื่อนไหวทางการเมืองที่ลี้ภัยออกจากประเทศไทยและได้หายตัวไปจากที่พักในประเทศเพื่อนบ้านในช่วงคืนวันที่ 12-13 ธ.ค.2561 พร้อมกันกับสุรชัยและคนสนิทอีกคนหนึ่ง\n\nปรานี ด่านวัฒนานุสรณ์ ภรรยาของสุรชัยกล่าวว่า ได้ทราบข่าวจากบุตรชายของภูชนะแล้ว แต่ยังไม่ขอพูดอะไร โดยเบื้องต้นได้ทำใจแต่แรกแล้วว่าเหตุการณ์ลักษณะนี้จะต้องเกิดขึ้นสักวันหนึ่ง\n\nขณะที่ นาย ว. 
พี่เขยของ 'กาสะลอง' อีกหนึ่งผู้ลี้ภัยที่ได้หายไปพร้อมกับนายสุรชัยกล่าวกับผู้สื่อข่าวว่า หลังจากได้ทราบผลการตรวจสอบ DNA จากลูกชายของภูชนะ ทางญาติของกาสะลองก็ได้ติดต่อไปที่พนักงานสอบสวนเจ้าของคดีและได้คำตอบว่า จะได้ทราบผลการตรวจสอบ DNA ภายในเวลา 2-3 วันนี้\n\nสุรชัย (78 ปี) ภูชนะ (54 ปี) กาสะลอง (47 ปี) เป็นนักเคลื่อนไหวทางการเมือง และเป็นผู้ลี้ภัยจากเหตุการณ์รัฐประหาร 2557 ไปยังประเทศเพื่อนบ้านได้หายออกจากที่พักในประเทศเพื่อนบ้านโดยที่ไม่มีใครสามารถติดต่อได้จนปัจจุบันนับเป็นเวลานาน 1 เดือน กับอีก 10 วัน" +} \ No newline at end of file diff --git a/tests/data/metadata/time_001.json b/tests/data/metadata/time_001.json old mode 100755 new mode 100644 index ead8e8f..47ac665 --- a/tests/data/metadata/time_001.json +++ b/tests/data/metadata/time_001.json @@ -6,13 +6,13 @@ "top_image": "https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85", "meta_img": "https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85", "images": [ - "https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=2400" + "https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=640 640w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=750 750w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=828 828w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=1080 1080w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=1200 1200w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=1440 1440w, 
https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=1690 1690w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=1920 1920w, https://api.time.com/wp-content/uploads/2023/11/climate-change-study-geoengineering-james-hansen.jpg?quality=85&w=2400 2400w" ], "movies": [], "keywords": [ - "solution", "james", "calls", + "solution", "climate", "geoengineering", "hansen", @@ -20,31 +20,31 @@ "scientists", "emissions", "dioxide", - "sulfur", - "paper", "mann", - "dangerous", "controversial", - "2c", - "writes", - "university", - "solar", - "reduce", - "recent", - "models", - "hansens", - "greenhouse", - "conclusions", + "paper", + "2°c", + "dangerous", + "sulfur", "change", "carbon", "aerosols", + "atmosphere", + "recent", + "reduce", + "greenhouse", + "university", + "solar", + "conclusions", + "models", + "writes", + "hansen’s", + "published", "warning", - "unprecedented", "underestimating", - "sunlight", - "substantially", - "situation", - "scientist" + "planet", + "earth", + "5°c" ], "meta_keywords": [ "" @@ -61,5 +61,5 @@ "meta_site_name": "TIME", "canonical_link": "https://time.com/6330957/james-hansen-climate-warning-geoengineering-study/", "text": "James Hansen first warned Congress of the threat from climate change in 1988. Today, in a controversial new peer-reviewed paper published in Oxford Open Climate Change, he brings a new warning: Scientists are underestimating how fast the planet is warming. And the crisis will have to be met, in part, with geoengineering.\n\nAccording to the report, earth will pass 1.5°C of cumulative warming this decade, and exceed 2°C of warming before 2050. Scientists think that warming in excess of 2°C could unleash more dangerous effects, like the collapse of Antarctic ice sheets, leading to rapid sea level rise. 
Limiting warming to 1.5°C, and at least keeping it well under 2°C, is the goal of the Paris Climate Accord, with international policymakers gathering at yearly COP meetings to negotiate actions to meet that goal.\n\nClimate scientists have been underestimating how sensitive the global climate system will be to increased carbon dioxide emissions, according to the new paper. That’s in part because they have been improperly accounting for the effect of sulfur dioxide emissions from coal power plants and ships burning bunker fuel, which mask warming. Sulfur dioxide emissions, in the form of aerosols in the atmosphere, have the effect of reflecting sunlight, but they are also hazardous to human health. In recent years, regulations around the world have caused sulfur dioxide emissions to fall. That’s likely helped reduce air pollution responsible for millions of deaths every year, but, according to Hansen, the tradeoff has been accelerated warming. This, he says, is part of the reason for the record warming much of the Northern Hemisphere experienced this summer.\n\n“Humanity made a Faustian bargain by offsetting a substantial but uncertain fraction of greenhouse gas warming with aerosol cooling,” Hansen said alongside other scientists in a webinar introducing his new paper on Nov. 2. “Now, as we want to reduce all the chronic health effects of aerosols, our first Faustian payment is due.”\n\nRead more: A Controversial Technology Is Creating an Unprecedented Rift Among Climate Scientists\n\nHansen, an adjunct professor of earth and environmental sciences at Columbia University, has often been a critic of mainstream climate policy. In 2015, after the Paris Climate Accord was adopted, he labeled the agreement “a fraud” because it did not include mandates to tax fossil fuels in order to discourage them from being burned.\n\nEmissions cuts alone will not be enough to ensure a safe climate in future years, according to Hansen and his collaborators. 
Governments will have to impose carbon fees to help rapidly draw down emissions, they argue, adding it will also be necessary to research and deploy techniques to reduce incoming solar radiation, also known as solar geoengineering.\n\nSuch methods, which can include releasing sulfur dioxide into the stratosphere, or spraying seawater into the air to form clouds, are highly controversial, with many researchers warning of dangerous, potentially unintended consequences. Currently, the main debate among the largest body of climate scientists is whether to even begin funding research into geoengineering in case humanity needs it, or if putting the option on the table is too dangerous to consider. Few hold the position, as argued in the new paper, that geoengineering deployment will definitely be necessary.\n\n“The 2°C warming limit is dead, unless we take purposeful actions to alter the earth’s energy imbalance,” Hansen said in the webinar.\n\n“We have to recognize we're geoengineering the planet right now [with greenhouse gasses],” Hansen added later, in response to a question from TIME. “We have to minimize that human-made geoengineering. And, on a temporary timescale, that will probably require reflecting sunlight, just because of how difficult it is to get the greenhouse gasses out of the atmosphere.”\n\nRead more: Inside a Controversial Startup's Risky Attempt to Control Our Climate\n\nHanson and his collaborators’ findings on the severity of warming and the necessity for geoengineering fall largely outside the broad conclusions made by international climate scientists from the U.N. Intergovernmental Panel on Climate Change. In part, they result from a reassessment of prehistoric climate data. Those interpretations are disputed by Michael Mann, a well-known climate scientist at Pennsylvania State University, in a commentary published on Nov. 
1.\n\n“I come away with very different conclusions about what we collectively learn from the Cenozoic, cooling, the Pliocene, and the Holocene. Basically, they tell us that climate models have the climate sensitivity… about right,” Mann writes. He also disputes Hansen’s conclusions that recent shipping emission rules have substantially affected recent warming.\n\nMann calls Hansen’s arguments for the necessity of solar geoengineering, “misguided policy advocacy.”\n\n“The authors are promoting the unprecedented, and potentially very dangerous, ‘geoengineeering’ gambit of attempting to manipulate our planetary environment,” Mann writes. “This desperate action is motivated by what I consider to be a fallacy, advanced by the article, that large-scale warming will be substantially greater than current-generation models project.” (Current models say that carbon dioxide emissions will warm the atmosphere a lot less compared to Hansen’s analysis.)\n\nAccording to Mann, the climate situation is still extremely dire. But it is a situation that concerted efforts to decarbonize our economy can address, without resorting to geoengineering.\n\nOther scientists are softer on the paper’s temperature predictions, but echo Mann’s concerns about the dangers of geoengineering.\n\n“Much of Jim’s analysis is credible,” writes Michael Oppenheimer, a climate scientist at Princeton University, over email. “However, I disagree on the solution: using [aerosols] to geoengineer the climate. No outdoor experiments should be done and no implementation of this approach considered until there is a framework for global governance of geoengineering of this type in place, agreed to by a broad swath of countries.”", - "text_cleaned": "James Hansen first warned Congress of the threat from climate change in 1988. Today, in a controversial new peer-reviewed paper published in Oxford Open Climate Change, he brings a new warning: Scientists are underestimating how fast the planet is warming. 
And the crisis will have to be met, in part, with geoengineering.\n\nAccording to the report, earth will pass 1.5°C of cumulative warming this decade, and exceed 2°C of warming before 2050. Scientists think that warming in excess of 2°C could unleash more dangerous effects, like the collapse of Antarctic ice sheets, leading to rapid sea level rise. Limiting warming to 1.5°C, and at least keeping it well under 2°C, is the goal of the Paris Climate Accord, with international policymakers gathering at yearly COP meetings to negotiate actions to meet that goal.\n\nClimate scientists have been underestimating how sensitive the global climate system will be to increased carbon dioxide emissions, according to the new paper. That’s in part because they have been improperly accounting for the effect of sulfur dioxide emissions from coal power plants and ships burning bunker fuel, which mask warming. Sulfur dioxide emissions, in the form of aerosols in the atmosphere, have the effect of reflecting sunlight, but they are also hazardous to human health. In recent years, regulations around the world have caused sulfur dioxide emissions to fall. That’s likely helped reduce air pollution responsible for millions of deaths every year, but, according to Hansen, the tradeoff has been accelerated warming. This, he says, is part of the reason for the record warming much of the Northern Hemisphere experienced this summer.\n\n“Humanity made a Faustian bargain by offsetting a substantial but uncertain fraction of greenhouse gas warming with aerosol cooling,” Hansen said alongside other scientists in a webinar introducing his new paper on Nov. 2. 
“Now, as we want to reduce all the chronic health effects of aerosols, our first Faustian payment is due.”\n\nRead more: A Controversial Technology Is Creating an Unprecedented Rift Among Climate Scientists\n\nHansen, an adjunct professor of earth and environmental sciences at Columbia University, has often been a critic of mainstream climate policy. In 2015, after the Paris Climate Accord was adopted, he labeled the agreement “a fraud” because it did not include mandates to tax fossil fuels in order to discourage them from being burned.\n\nEmissions cuts alone will not be enough to ensure a safe climate in future years, according to Hansen and his collaborators. Governments will have to impose carbon fees to help rapidly draw down emissions, they argue, adding it will also be necessary to research and deploy techniques to reduce incoming solar radiation, also known as solar geoengineering.\n\nSuch methods, which can include releasing sulfur dioxide into the stratosphere, or spraying seawater into the air to form clouds, are highly controversial, with many researchers warning of dangerous, potentially unintended consequences. Currently, the main debate among the largest body of climate scientists is whether to even begin funding research into geoengineering in case humanity needs it, or if putting the option on the table is too dangerous to consider. Few hold the position, as argued in the new paper, that geoengineering deployment will definitely be necessary.\n\n“The 2°C warming limit is dead, unless we take purposeful actions to alter the earth’s energy imbalance,” Hansen said in the webinar.\n\n“We have to recognize we're geoengineering the planet right now [with greenhouse gasses],” Hansen added later, in response to a question from TIME. “We have to minimize that human-made geoengineering. 
And, on a temporary timescale, that will probably require reflecting sunlight, just because of how difficult it is to get the greenhouse gasses out of the atmosphere.”\n\nRead more: Inside a Controversial Startup's Risky Attempt to Control Our Climate\n\nHanson and his collaborators’ findings on the severity of warming and the necessity for geoengineering fall largely outside the broad conclusions made by international climate scientists from the U.N. Intergovernmental Panel on Climate Change. In part, they result from a reassessment of prehistoric climate data. Those interpretations are disputed by Michael Mann, a well-known climate scientist at Pennsylvania State University, in a commentary published on Nov. 1.\n\n“I come away with very different conclusions about what we collectively learn from the Cenozoic, cooling, the Pliocene, and the Holocene. Basically, they tell us that climate models have the climate sensitivity… about right,” Mann writes. He also disputes Hansen’s conclusions that recent shipping emission rules have substantially affected recent warming.\n\nMann calls Hansen’s arguments for the necessity of solar geoengineering, “misguided policy advocacy.”\n\n“The authors are promoting the unprecedented, and potentially very dangerous, ‘geoengineeering’ gambit of attempting to manipulate our planetary environment,” Mann writes. “This desperate action is motivated by what I consider to be a fallacy, advanced by the article, that large-scale warming will be substantially greater than current-generation models project.” (Current models say that carbon dioxide emissions will warm the atmosphere a lot less compared to Hansen’s analysis.)\n\nAccording to Mann, the climate situation is still extremely dire. 
But it is a situation that concerted efforts to decarbonize our economy can address, without resorting to geoengineering.\n\nOther scientists are softer on the paper’s temperature predictions, but echo Mann’s concerns about the dangers of geoengineering.\n\n“Much of Jim’s analysis is credible,” writes Michael Oppenheimer, a climate scientist at Princeton University, over email. “However, I disagree on the solution: using [aerosols] to geoengineer the climate. No outdoor experiments should be done and no implementation of this approach considered until there is a framework for global governance of geoengineering of this type in place, agreed to by a broad swath of countries.”" + "text_cleaned": "Climateadaptation\n\nWe Need Geoengineering to Stop Out of Control Warming, Warns Climate Scientist James Hansen\n\nWe Need Geoengineering to Stop Out of Control Warming, Warns Climate Scientist James Hansen\n\nNovember 2, 2023 3:53 PM EDT\n\nJames Hansen first warned Congress of the threat from climate change in 1988. Today, in a controversial new peer-reviewed paper published in Oxford Open Climate Change, he brings a new warning: Scientists are underestimating how fast the planet is warming. And the crisis will have to be met, in part, with geoengineering.\n\nAccording to the report, earth will pass 1.5°C of cumulative warming this decade, and exceed 2°C of warming before 2050. Scientists think that warming in excess of 2°C could unleash more dangerous effects, like the collapse of Antarctic ice sheets, leading to rapid sea level rise. Limiting warming to 1.5°C, and at least keeping it well under 2°C, is the goal of the Paris Climate Accord, with international policymakers gathering at yearly COP meetings to negotiate actions to meet that goal.\n\nClimate scientists have been underestimating how sensitive the global climate system will be to increased carbon dioxide emissions, according to the new paper. 
That’s in part because they have been improperly accounting for the effect of sulfur dioxide emissions from coal power plants and ships burning bunker fuel, which mask warming. Sulfur dioxide emissions, in the form of aerosols in the atmosphere, have the effect of reflecting sunlight, but they are also hazardous to human health. In recent years, regulations around the world have caused sulfur dioxide emissions to fall. That’s likely helped reduce air pollution responsible for millions of deaths every year, but, according to Hansen, the tradeoff has been accelerated warming. This, he says, is part of the reason for the record warming much of the Northern Hemisphere experienced this summer.\n\n“Humanity made a Faustian bargain by offsetting a substantial but uncertain fraction of greenhouse gas warming with aerosol cooling,” Hansen said alongside other scientists in a webinar introducing his new paper on Nov. 2. “Now, as we want to reduce all the chronic health effects of aerosols, our first Faustian payment is due.”\n\nRead more: A Controversial Technology Is Creating an Unprecedented Rift Among Climate Scientists\n\nHansen, an adjunct professor of earth and environmental sciences at Columbia University, has often been a critic of mainstream climate policy. In 2015, after the Paris Climate Accord was adopted, he labeled the agreement “a fraud” because it did not include mandates to tax fossil fuels in order to discourage them from being burned.\n\nEmissions cuts alone will not be enough to ensure a safe climate in future years, according to Hansen and his collaborators. 
Governments will have to impose carbon fees to help rapidly draw down emissions, they argue, adding it will also be necessary to research and deploy techniques to reduce incoming solar radiation, also known as solar geoengineering.\n\nSuch methods, which can include releasing sulfur dioxide into the stratosphere, or spraying seawater into the air to form clouds, are highly controversial, with many researchers warning of dangerous, potentially unintended consequences. Currently, the main debate among the largest body of climate scientists is whether to even begin funding research into geoengineering in case humanity needs it, or if putting the option on the table is too dangerous to consider. Few hold the position, as argued in the new paper, that geoengineering deployment will definitely be necessary.\n\n“The 2°C warming limit is dead, unless we take purposeful actions to alter the earth’s energy imbalance,” Hansen said in the webinar.\n\n“We have to recognize we're geoengineering the planet right now [with greenhouse gasses],” Hansen added later, in response to a question from TIME. “We have to minimize that human-made geoengineering. And, on a temporary timescale, that will probably require reflecting sunlight, just because of how difficult it is to get the greenhouse gasses out of the atmosphere.”\n\nRead more: Inside a Controversial Startup's Risky Attempt to Control Our Climate\n\nHanson and his collaborators’ findings on the severity of warming and the necessity for geoengineering fall largely outside the broad conclusions made by international climate scientists from the U.N. Intergovernmental Panel on Climate Change. In part, they result from a reassessment of prehistoric climate data. Those interpretations are disputed by Michael Mann, a well-known climate scientist at Pennsylvania State University, in a commentary published on Nov. 
1.\n\n“I come away with very different conclusions about what we collectively learn from the Cenozoic, cooling, the Pliocene, and the Holocene. Basically, they tell us that climate models have the climate sensitivity… about right,” Mann writes. He also disputes Hansen’s conclusions that recent shipping emission rules have substantially affected recent warming.\n\nMann calls Hansen’s arguments for the necessity of solar geoengineering, “misguided policy advocacy.”\n\n“The authors are promoting the unprecedented, and potentially very dangerous, ‘geoengineeering’ gambit of attempting to manipulate our planetary environment,” Mann writes. “This desperate action is motivated by what I consider to be a fallacy, advanced by the article, that large-scale warming will be substantially greater than current-generation models project.” (Current models say that carbon dioxide emissions will warm the atmosphere a lot less compared to Hansen’s analysis.)\n\nAccording to Mann, the climate situation is still extremely dire. But it is a situation that concerted efforts to decarbonize our economy can address, without resorting to geoengineering.\n\nOther scientists are softer on the paper’s temperature predictions, but echo Mann’s concerns about the dangers of geoengineering.\n\n“Much of Jim’s analysis is credible,” writes Michael Oppenheimer, a climate scientist at Princeton University, over email. “However, I disagree on the solution: using [aerosols] to geoengineer the climate. 
No outdoor experiments should be done and no implementation of this approach considered until there is a framework for global governance of geoengineering of this type in place, agreed to by a broad swath of countries.”\n\nMore Must-Reads From TIME\n\nThe Struggle to Save Lives Inside Gaza’s HospitalsVolodymyr Zelensky’s Struggle to Keep Ukraine in the FightSheikh Hasina and the Future of Democracy in BangladeshThe War Is Making Americans Question Their RelationshipsIs Using the Snooze Button Bad for You?Zooey Zephyr: The Love Story of My Chosen FamilyThe Best Inventions of 2023Want Weekly Recs on What to Watch, Read, and More? Sign Up for Worth Your Time\n\nWrite to Alejandro de la Garza at alejandro.delagarza@time.com\n\nEdit Post" } \ No newline at end of file diff --git a/tests/data/metadata/wired_001.json b/tests/data/metadata/wired_001.json index 9d3c150..c167bc2 100644 --- a/tests/data/metadata/wired_001.json +++ b/tests/data/metadata/wired_001.json @@ -11,8 +11,8 @@ ], "movies": [], "keywords": [ - "lockdown", "intensified", + "lockdown", "israeli", "surveillance", "west", @@ -20,32 +20,32 @@ "palestinians", "soldiers", "hebron", - "israels", - "israel", + "israel’s", "checkpoint", - "work", + "israel", "palestinian", - "wolf", - "technology", - "system", - "recognition", - "occupation", - "military", + "azza", + "settlers", + "work", + "days", "leave", "facial", + "recognition", + "system", "checkpoints", - "azza", - "weeks", + "technology", + "region", + "wolf", + "military", + "occupation", + "sector", + "neighborhood", + "camera", + "lives", "tel", - "technologies", - "systems", - "software", - "settlers", - "security", "rumeida", - "restrictions", - "recent", - "private" + "it’s", + "defense" ], "meta_keywords": [ "israel-hamas war", diff --git a/tests/data/metadata/yna_co_kr.json b/tests/data/metadata/yna_co_kr.json index d65a432..5eef882 100644 --- a/tests/data/metadata/yna_co_kr.json +++ b/tests/data/metadata/yna_co_kr.json @@ -26,40 +26,40 @@ 
"movies": [], "keywords": [ "korea", - "warns", - "slams", - "s", "n", + "slams", + "warns", "consequences", "military", "joint", + "s", "drill", "south", "exercise", "north", "seoul", - "washington", + "conducting", + "day", + "pay", "stupid", - "resume", + "acts", + "washington", "pyongyang", - "pay", - "paper", "invading", - "interkorean", - "day", - "conducting", + "paper", + "inter-korean", + "resume", + "aug", + "20", "yonhap", - "wrap", - "wont", - "weekslong", - "weeks", - "week", - "weapons", - "wartime", - "warnings", + "korea's", + "newspaper", + "blasted", + "united", + "states", "warning", - "warned", - "warmongers" + "dearly", + "wrap" ], "meta_keywords": [ "" diff --git a/tests/data/txt/hindi_article.txt b/tests/data/txt/hindi_article.txt new file mode 100644 index 0000000..fb4efee --- /dev/null +++ b/tests/data/txt/hindi_article.txt @@ -0,0 +1,31 @@ +22 जनवरी से सतयुग की शुरुआत…, स्वामी अधोक्षजानंद ने प्राण प्रतिष्ठा पर सवाल उठाने वालों को लपेटा, नसीहत में कह दी बड़ी बात + +Swami Adhokshajanand Tirthadev - गोवर्धन मठ के स्वामी अधोक्षजानंद तीर्थदेव ने राम मंदिर प्राण प्रतिष्ठा में धर्म गुरुओं के विरोध की बातों का खंडन किया है। उनका कहना है कि जो लोग भ्रामक अफवाह फैला रहे हैं समय आने पर उन्हें जवाब दिया जाएगा। 22 जनवरी का दिन सतयुग की शुरुआत जैसा होगा। पूरा देश इसे महापर्व की तरह मनाएगा। + +जागरण संवाददाता, आगरा। गोवर्धन मठ के स्वामी अधोक्षजानंद तीर्थदेव ने राम मंदिर प्राण प्रतिष्ठा में धर्म गुरुओं के विरोध की बातों का खंडन किया है। उनका कहना है कि जो लोग भ्रामक अफवाह फैला रहे हैं, समय आने पर उन्हें जवाब दिया जाएगा। 22 जनवरी का दिन सतयुग की शुरुआत जैसा होगा। पूरा देश इसे महापर्व की तरह मनाएगा। + +स्वामी अधोक्षजानंद तीर्थ देव संस्कृति और धर्म का प्रसार कर 18 देशों को अखंड भारत से वापस जोड़ने के उद्देश्य से अखंड भारत यात्रा पर निकले हैं। अरुणाचल से प्रयागराज एक माह के प्रवास पर जाने के दौरान उनका आगरा आगमन हुआ था। यहां एक भक्त के प्रतिष्ठान पर उन्होंने अल्पाहार लिया। + +पत्र जारी कर किया खंडन + +मीडिया से बातचीत में उन्होंने कहा कि धर्मगुरुओं के द्वारा 
राम मंदिर का आमंत्रण न स्वीकारने जैसी बातें सिर्फ अफवाह हैं। श्रृंगेरी मठ, द्वारिका मठ और गोवर्धन मठ द्वारा पत्र जारी कर इसका खंडन किया गया है। + +राम मंदिर के निमंत्रण की चर्चा नहीं होनी चाहिए। राम खुद धर्म हैं और उनके मंदिर की प्राण प्रतिष्ठा में जाने से कोई भला कैसे मना कर सकता है। सभी समय के अनुसार 22 जनवरी या उसके बाद रामलला के दर्शन करने जाएंगे। + +धर्म गुरुओं का काम धर्म और संस्कृति का प्रसार + +उनका कहना है कि धर्म गुरुओं का काम धर्म और संस्कृति का प्रसार करना है। वो भी यही कर रहे हैं। इसके लिए 19 नवंबर 2019 से अखंड भारत यात्रा पर निकले हैं। भारत के सभी राज्य 52 शक्तिपीठ और 12 ज्योतिर्लिंग के दर्शन कर रहे हैं। नेपाल, भूटान, कंबोडिया, बांग्लादेश समेत कई देश घूम आए हैं। + +राजनीति से दूर रहने की बात कही + +उनका मानना है की एक ज्योतिर्लिंग पाकिस्तान में है और राजनीति करने वाले आर्य उसे जल्द अखंड भारत में समाहित कर लेंगे। कुछ धर्म गुरुओं द्वारा राम मंदिर के मुहूर्त और निमंत्रण न स्वीकारने जैसी बातें बोलने के मामले पर उन्होंने ऐसे लोगों को धर्म विरोधी बताया है और राजनीति से दूर रहने की बात कही है। + +त्रेता युग जैसा है मुहूर्त + +स्वामी अधोक्षजानंद का कहना है कि राम मंदिर प्राण प्रतिष्ठा का मुहूर्त त्रेता युग में प्रभु श्री राम के जन्म के जैसा शुभ है। मंदिर का निर्माण अभी काफी समय तक चलेगा पर गर्भगृह तैयार होने के बाद रामलला की प्राण प्रतिष्ठा पूरी तरह तर्कसंगत है। + +यह भी पढ़ें: पीएम मोदी के आने से पहले अयोध्या पहुंची किट, बॉक्स पर लिखा है VIP Kit-PM, सोचिए इसमें क्या होगा? 
+ +यह भी पढ़ें: आचार्य प्रमोद कृष्णम को मिला श्रीराम मंदिर प्राण प्रतिष्ठा का निमंत्रण, कल्कि धाम में मंदिर को लेकर कह दी ये बड़ी बात + +Jagran.com अब whatsapp चैनल पर भी उपलब्ध है। आज ही फॉलो करें और पाएं महत्वपूर्ण खबरें WhatsApp चैनल से जुड़ें \ No newline at end of file diff --git a/tests/data/txt/korean_article.txt b/tests/data/txt/korean_article.txt new file mode 100644 index 0000000..b21b199 --- /dev/null +++ b/tests/data/txt/korean_article.txt @@ -0,0 +1,15 @@ +최태원 "동거인에 6억 지출…노소영 1000억 증여 주장 허위" + +노소영 아트센터 나비 관장이 최태원 SK그룹 회장의 동거인 김희영 티앤씨재단 이사장을 상대로 낸 위자료 소송의 첫 정식 변론을 하루 앞두고 최 회장 측이 1000억원 증여 의혹을 전면 반박했다. + +최 회장의 대리인단은 17일 입장문을 통해 "노 관장 측이 언론에 근거로 제시한 자료는 최 회장 개인 부동산, 미술품 구입, 벤처 투자금, 사회공헌 기부금이 대부분"이라며 "이를 합산해 김 이사장에게 증여했다는 것은 억지 주장"이라고 말했다. + +대리인단은 "노 관장 측이 주장하는 금융자료는 2015년 이후 최 회장이 소유한 모든 계좌를 합한 것인데, 실제로 여기에서 8년간 김 이사장에게 지출된 금액은 6억1000만원"이라며 "김 이사장이 최 회장이 설립한 공익재단에서 무보수로 7년째 근무 중임을 생각하면 많은 금액이라 할 수 없다"고 주장했다. + +그러면서 "20년의 혼인 기간, 14년의 별거 기간 대부분 노 관장은 최 회장의 급여 전액을 본인 통장에 이체받아 사용했다"며 "현재 노 관장 명의 재산 가액이 드러난 것만 약 200억원인데 이는 최 회장 급여에 기반해 형성된 것"이라고 했다. + +이어 "노 관장 측 계산 방식에 따르면 금융 자료가 남아있는 것만 합산해도 노 관장이 최 회장으로부터 지원받은 돈은 최소 1140억여원"이라며 "더 이상 음해와 선동을 위한 언론플레이를 멈추길 촉구한다"고 강조했다. + +노 관장의 대리인은 작년 11월 김 이사장을 상대로 낸 위자료 소송을 마치고 취재진에 "최 회장이 김 이사장에게 쓴 돈이 1000억원이 넘는다"며 "간통 행위로 인해 상간녀가 취득한 이익이 크다면 이혼소송의 위자료 산정에도 고려돼야 한다"고 주장했다. + +18일 오후 위자료 소송의 첫 정식 변론이 열린다. 한편 이와 별개로 최 회장과 노 관장은 이혼 소송 중이며 현재 2심이 진행되는 과정에서 소송을 대리할 변호사 선임, 가정사 등을 놓고 장외 공방을 벌이고 있다. 
\ No newline at end of file diff --git a/tests/test_article.py b/tests/test_article.py index 7759062..157c448 100755 --- a/tests/test_article.py +++ b/tests/test_article.py @@ -19,11 +19,14 @@ def cnn_article(): ) html_content = conftest.get_data("cnn_article", "html") text_content = conftest.get_data("cnn_article", "txt") + json_content = conftest.get_data("cnn_article", "metadata") return { "url": url, "html_content": html_content, "text_content": text_content, + "summary": json_content["summary"], + "keywords": json_content["keywords"], } @@ -180,31 +183,13 @@ def test_meta_refresh(self, meta_refresh): assert article.title == title def test_article_nlp(self, cnn_article): - article = newspaper.Article( - cnn_article["url"], max_keywords=10, fetch_images=False - ) + article = newspaper.Article(cnn_article["url"], fetch_images=False) article.download(input_html=cnn_article["html_content"]) article.parse() article.nlp() - summary = conftest.get_data("cnn_summary", "txt") - summary = summary.strip() - - assert sorted(article.keywords) == sorted( - [ - "flight", - "forecasters", - "good", - "sailing", - "smooth", - "storm", - "thanksgiving", - "travel", - "weather", - "winds", - ] - ) - assert article.summary.strip() == summary + assert sorted(article.keywords) == sorted(cnn_article["keywords"]) + assert article.summary.strip() == cnn_article["summary"].strip() def test_download_inexisting_file(self): url = "file://" + str( diff --git a/tests/test_languages.py b/tests/test_languages.py index 878d3bf..227ee9d 100755 --- a/tests/test_languages.py +++ b/tests/test_languages.py @@ -3,6 +3,7 @@ import newspaper from newspaper import nlp from newspaper.article import Article +from newspaper.text import StopWords from tests import conftest @@ -60,6 +61,44 @@ def valid_language_fixture(): return newspaper.valid_languages() +@pytest.fixture +def language_text_fixture(): + return { + "en": { + "text": conftest.get_data("cnn_article", "txt"), + "stopwords": 638, + }, + "th": { 
+ "text": conftest.get_data("thai_article", "txt"), + "stopwords": 98, + }, + "ar": { + "text": conftest.get_data("arabic_article", "txt"), + "stopwords": 87, + }, + "es": { + "text": conftest.get_data("spanish_article", "txt"), + "stopwords": 221, + }, + "zh": { + "text": conftest.get_data("chinese_article", "txt"), + "stopwords": 88, + }, + "ja": { + "text": conftest.get_data("japanese_article", "txt"), + "stopwords": 46, + }, + "ko": { + "text": conftest.get_data("korean_article", "txt"), + "stopwords": 122, + }, + # "hi": { + # "text": conftest.get_data("hindi_article", "txt"), + # "stopwords": 0, + # }, + } + + class TestLanguages: def test_error_unknown_language(self): with pytest.raises(ValueError): @@ -87,3 +126,17 @@ def test_full_extract(self, language_article_fixture): # TODO: test text_cleaned assert len(errors) == 0, f"Test failed for {errors}" + + def test_stopwords(self, language_text_fixture): + errors = [] + for lang, text in language_text_fixture.items(): + stopwords = StopWords(lang) + + stat = stopwords.get_stopword_count(text["text"]) + if stat.stop_word_count != text["stopwords"]: + errors.append( + f"Stopwords count for {lang} is {stat.stop_word_count} instead of" + f" {text['stopwords']}" + ) + + assert len(errors) == 0, "Errors in Stopwords: \n" + "\n".join(errors) diff --git a/tests/test_nlp.py b/tests/test_nlp.py index a75d9d4..c75dbeb 100755 --- a/tests/test_nlp.py +++ b/tests/test_nlp.py @@ -1,5 +1,7 @@ import pytest import newspaper +from newspaper import nlp +from newspaper.text import StopWords from tests import conftest @@ -16,11 +18,50 @@ def cnn_article(): "url": url, "html_content": html_content, "text_content": text_content, + "title": ( + "Gates on Tillerson and Russia: 'You can be friendly without being friends'" + ), + "summary": [ + ( + "Former Defense Secretary Robert Gates on Sunday defended" + " President-elect Donald Trump\u2019s pick for secretary of state," + " ExxonMobil CEO Rex Tillerson, over his relationship with 
Russian" + " President Vladimir Putin." + ), + ( + "But being friendly doesn\u2019t make you friends,\u201d Gates said in" + " an interview on NBC\u2019s \u201cMeet the Press.\u201d Gates\u2019" + " comments come after he and former Secretary of State Condoleezza Rice" + " both recommended Tillerson for the job." + ), + ( + "Their recommendation has faced scrutiny since Gates and Rice both have" + " business ties to ExxonMobil through their consulting firm." + ), + ( + "Tillerson has faced criticism over his relationship with Putin and" + " Russia amid the intelligence community\u2019s finding that Russian" + " hackers stole Democratic emails in a bid to influence the US" + " election." + ), + ( + "But Gates said Tillerson\u2019s business relationship with Putin is" + " being mistaken for a close personal friendship." + ), + ], } +@pytest.fixture(scope="module") +def keywords_fixture(): + return dict( + text="The economy is going to be good. Have a good day. Day by day.", + keywords=["day", "good"], + ) + + class TestNLP: - def test_keywords(self, cnn_article): + def test_article_nlp(self, cnn_article): a = newspaper.article( cnn_article.get("url"), language="en", @@ -30,3 +71,21 @@ def test_keywords(self, cnn_article): a.nlp() assert len(a.keywords) == a.config.max_keywords + + def test_keywords(self, keywords_fixture): + text = keywords_fixture.get("text") + keywords = keywords_fixture.get("keywords") + stopwords = StopWords("en") + + keywords_ = nlp.keywords(text, stopwords, 2) + + assert list(keywords_.keys()) == keywords + + def test_summarize(self, cnn_article): + text = cnn_article.get("text_content") + title = cnn_article.get("title") + stopwords = StopWords("en") + + summary = nlp.summarize(title, text, stopwords) + + assert summary == cnn_article.get("summary")