diff --git a/README.md b/README.md
index 1d75baa..b05834c 100755
--- a/README.md
+++ b/README.md
@@ -4,12 +4,15 @@
 [![Coverage status](https://coveralls.io/repos/github/AndyTheFactory/newspaper4k/badge.svg?branch=master)](https://coveralls.io/github/AndyTheFactory/newspaper4k)
 [![Documentation Status](https://readthedocs.org/projects/newspaper4k/badge/?version=latest)](https://newspaper4k.readthedocs.io/en/latest/)
 
-At the moment the Newspaper4k Project is a fork of the well known newspaper3k by [codelucas](https://github.com/codelucas/newspaper) which was not updated since Sept 2020. The initial goal of this fork is to keep the project alive and to add new features and fix bugs.
+At the moment the Newspaper4k Project is a fork of the well-known newspaper3k by [codelucas](https://github.com/codelucas/newspaper), which has not been updated since September 2020. The initial goal of this fork is to keep the project alive, add new features, and fix bugs.
 
 I have duplicated all issues on the original project and will try to fix them. If you have any issues or feature requests please open an issue here.
 
-**Experimental ChatGPT helper bot for Newspaper4k:**
-[![ChatGPT helper](docs/user_guide/assets/chatgpt_chat.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt)
+| | |
+|-------------|-------------|
+| **Experimental ChatGPT helper bot for Newspaper4k:** | [![ChatGPT helper](docs/user_guide/assets/chatgpt_chat200x75.png)](https://chat.openai.com/g/g-OxSqyKAhi-newspaper-4k-gpt)|
+
+
 
 ## Python compatibility
 - Recommended: Python 3.8+
@@ -29,10 +32,10 @@
 You can start directly from the command line, using the included CLI:
 python -m newspaper --url="https://edition.cnn.com/2023/11/17/success/job-seekers-use-ai/index.html" --language=en --output-format=json --output-file=article.json
 ```
-
+More information about the CLI can be found in the [CLI documentation](https://newspaper4k.readthedocs.io/en/latest/user_guide/cli_reference.html).
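+
+For example, the JSON file written by the command above (`article.json`) can be inspected with a few lines of Python. This is only a quick sketch: the exact field names come from the CLI's JSON output schema, so the snippet just lists whatever keys are present instead of assuming them.
+
+```python
+import json
+
+# Load the file produced by the CLI call above
+with open("article.json", encoding="utf-8") as f:
+    data = json.load(f)
+
+# The output may be a single JSON object or a list of them, depending on how
+# many URLs were processed; normalize to one record and show its fields.
+record = data[0] if isinstance(data, list) else data
+print(sorted(record.keys()))
+```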
 ## Using the Python API
 
-Alternatively, you can use the Python API:
+Alternatively, you can use Newspaper4k in Python:
 
 ### Processing one article / url at a time
 
@@ -82,22 +85,22 @@
 import newspaper
 
 cnn_paper = newspaper.build('http://cnn.com', number_threads=3)
 print(cnn_paper.category_urls())
-> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com',
-> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com',
-> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com']
+>> ['https://cnn.com', 'https://money.cnn.com', 'https://arabic.cnn.com',
+>> 'https://cnnespanol.cnn.com', 'http://edition.cnn.com',
+>> 'https://edition.cnn.com', 'https://us.cnn.com', 'https://www.cnn.com']
 
 article_urls = [article.url for article in cnn_paper.articles]
 print(article_urls[:3])
-> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson',
-> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations',
-> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza']
+>> ['https://arabic.cnn.com/middle-east/article/2023/10/30/number-of-hostages-held-in-gaza-now-up-to-239-idf-spokesperson',
+>> 'https://arabic.cnn.com/middle-east/video/2023/10/30/v146619-sotu-sullivan-hostage-negotiations',
+>> 'https://arabic.cnn.com/middle-east/article/2023/10/29/norwegian-pm-israel-gaza']
 
 article = cnn_paper.articles[0]
 article.download()
 article.parse()
 print(article.title)
-> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى
+>> المتحدث باسم الجيش الإسرائيلي: عدد الرهائن المحتجزين في غزة يصل إلى
 ```
 
 Or if you want to get bulk articles from the website (keep in mind that this could take a long time and could get your IP blocked by the news site):
@@ -130,7 +133,7 @@
 article.download()
 article.parse()
 print(article.title)
-> 晶片大战:台湾厂商助攻华为突破美国封锁?
+>> 晶片大战:台湾厂商助攻华为突破美国封锁?
 
 if article.config.use_meta_language:
     # If we use the autodetected language, this config attribute will be true
@@ -138,7 +141,7 @@ if article.config.use_meta_language:
     print("Meta language detected and used")
 else:
     print(article.config.language)
 
-> zh
+>> zh
 ```
 
 # Docs
@@ -158,8 +161,25 @@ detailed guides using newspaper.
 
 - Automatic article text summarization
 - Author extraction from text
 - Easy to use Command Line Interface (`python -m newspaper....`)
+- Output in various formats (json, csv, text)
 - Works in 10+ languages (English, Chinese, German, Arabic, \...)
 
+# Evaluation
+
+## Evaluation Results
+
+
+Using the dataset from [ScrapingHub](https://github.com/scrapinghub/article-extraction-benchmark), I created an [evaluator script](tests/evaluation/evaluate.py) that compares the performance of newspaper against its previous versions. This way we can see whether newspaper updates improve or worsen the performance of the library.
+
+| Version | Corpus BLEU Score | Corpus Precision Score | Corpus Recall Score | Corpus F1 Score |
+|--------------------|-------------------|------------------------|---------------------|-----------------|
+| Newspaper3k 0.2.8 | 0.8660 | 0.9128 | 0.9071 | 0.9100 |
+| Newspaper4k 0.9.0 | 0.9212 | 0.8992 | 0.9336 | 0.9161 |
+| Newspaper4k 0.9.1 | 0.9224 | 0.8895 | 0.9242 | 0.9065 |
+| Newspaper4k 0.9.2 | 0.9426 | 0.9070 | 0.9087 | 0.9078 |
+
+Precision, Recall and F1 are computed using the overlap of shingles with n-grams of size 4. The corpus BLEU score is computed using [nltk's bleu_score](https://www.nltk.org/api/nltk.translate.bleu).
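+
+To make the scoring concrete, the shingle overlap can be sketched roughly as follows. This is a simplified illustration of 4-gram precision/recall/F1 (the helper names are made up for the example), not the exact logic of the evaluator script:
+
+```python
+from collections import Counter
+
+def shingles(text, n=4):
+    # Multiset of word n-grams ("shingles") of size n
+    tokens = text.split()
+    return Counter(tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1))
+
+def overlap_scores(extracted, reference, n=4):
+    pred, gold = shingles(extracted, n), shingles(reference, n)
+    common = sum((pred & gold).values())  # shared 4-grams (multiset intersection)
+    precision = common / max(sum(pred.values()), 1)
+    recall = common / max(sum(gold.values()), 1)
+    f1 = 2 * precision * recall / max(precision + recall, 1e-9)
+    return precision, recall, f1
+
+print(overlap_scores("the quick brown fox jumps over the lazy dog",
+                     "a quick brown fox jumps over a lazy dog"))
+```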
+
 # Requirements and dependencies
 
 The following system packages are required:
diff --git a/docs/user_guide/advanced.rst b/docs/user_guide/advanced.rst
index bbc2a83..d224332 100755
--- a/docs/user_guide/advanced.rst
+++ b/docs/user_guide/advanced.rst
@@ -11,7 +11,7 @@ Multi-threading article downloads
 **Downloading articles one at a time is slow.** But spamming a single news
 source like cnn.com with tons of threads or with ASYNC-IO will cause rate limiting
-and also doing that is very mean.
+and can also lead to your IP being blocked by the site.
 
 We solve this problem by allocating 1-2 threads per news source to both greatly
 speed up the download time while being respectful.
@@ -19,22 +19,50 @@ speed up the download time while being respectful.
 
 .. code-block:: python
 
     import newspaper
-    from newspaper import news_pool
+    from newspaper.mthreading import fetch_news
 
     slate_paper = newspaper.build('http://slate.com')
     tc_paper = newspaper.build('http://techcrunch.com')
     espn_paper = newspaper.build('http://espn.com')
 
     papers = [slate_paper, tc_paper, espn_paper]
-    news_pool.set(papers, threads_per_source=2) # (3*2) = 6 threads total
-    news_pool.join()
+    results = fetch_news(papers, threads=4)
+
     #At this point, you can safely assume that download() has been
     #called on every single article for all 3 sources.
 
-    print(slate_paper.articles[10].html)
+    print(slate_paper.articles[10].title) #' ...'
+
+In addition to :any:`Source` objects, :any:`fetch_news` also accepts :any:`Article` objects or simple URLs.
+
+.. code-block:: python
+
+    article_urls = [f'https://abcnews.go.com/US/x/story?id={i}' for i in range(106379500, 106379520)]
+    articles = [Article(url=u) for u in article_urls]
+
+    results = fetch_news(articles, threads=4)
+
+    urls = [
+        "https://www.foxnews.com/media/homeowner-new-florida-bill-close-squatting-loophole-return-some-fairness",
+        "https://edition.cnn.com/2023/12/27/middleeast/dutch-diplomat-humanitarian-aid-gaza-sigrid-kaag-intl/index.html",
+    ]
+
+    results = fetch_news(urls, threads=4)
+
+    # or everything at once
+    papers = [slate_paper, tc_paper, espn_paper]
+    papers.extend(articles)
+    papers.extend(urls)
+
+    results = fetch_news(papers, threads=4)
+
+
+**Note:** In previous versions of newspaper, this could be done with the ``news_pool`` call, but it was not very robust
+and was replaced with a ``ThreadPoolExecutor``-based implementation.
+
 Keeping just the Html of the main body article
 ------------------------------------------------
@@ -191,12 +219,84 @@ The full available options are available under the :any:`Configuration` section
 
 Caching
 -------
 
-TODO
+The Newspaper4k library provides a simple caching mechanism that can be used to avoid repeatedly downloading the same article. Additionally, when building a :any:`Source` object, the category URL detection is cached for 24 hours.
+
+Both mechanisms are enabled by default. Article caching is controlled by the ``memoize_articles`` parameter of the :any:`newspaper.build()` function or, when creating a :any:`Source` object directly, by the ``memoize_articles`` parameter of its constructor. Setting it to ``False`` disables the caching mechanism.
+
+The category detection caching is controlled by the ``utils.cache_disk.enabled`` setting. Setting it to ``False`` disables the caching decorator on the ``Source._get_category_urls(..)`` method.
+
+For example:
+
+..
code-block:: python + + import newspaper + from newspaper import utils + + cbs_paper = newspaper.build('http://cbs.com') + + # Disable article caching + utils.cache_disk.enabled = False + + cbs_paper2 = newspaper.build('http://cbs.com') # The categories will be re-detected + + # Enable article caching + utils.cache_disk.enabled = True + + cbs_paper3 = newspaper.build('http://cbs.com') # The cached category urls will be loaded + + Proxy Usage -------------- -TODO +Often times websites block repeated access from a single IP address. Or, some websites might limit access from certain geographic locations (due to legal reasons, etc.). To bypass these restrictions, you can use a proxy. Newspaper supports using a proxy by passing the ``proxies`` parameter to the :any:`Article` object's constructor or :any:`Source` object's constructor. The ``proxies`` parameter should be a dictionary, as required by the ``requests library``, with the following format: + +.. code-block:: python + + from newspaper import Article + + # Define your proxy + proxies = { + 'http': 'http://your_http_proxy:port', + 'https': 'https://your_https_proxy:port' + } + + # URL of the article you want to scrape + url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667' + + # Create an Article object, passing the proxies parameter + article = Article(url, proxies=proxies) + + # Download and parse the article + article.download() + article.parse() + + # Access the article's text, keywords, and summary + print("Title:", article.title) + print("Text:", article.text) + +or the shorter version: + +.. code-block:: python + + from newspaper import article + + # Define your proxy + proxies = { + 'http': 'http://your_http_proxy:port', + 'https': 'https://your_https_proxy:port' + } + + # URL of the article you want to scrape + url = 'https://abcnews.go.com/Technology/wireStory/indonesias-mount-marapi-erupts-leading-evacuations-reported-casualties-106358667' + + # Create an Article object, + article = article(url, proxies=proxies) + + # Access the article's text, keywords, and summary + print("Title:", article.title) + print("Text:", article.text) + Cookie Usage (simulate logged in user) -------------------------------------- diff --git a/docs/user_guide/api_reference.rst b/docs/user_guide/api_reference.rst index 4f7d7da..13b39f0 100755 --- a/docs/user_guide/api_reference.rst +++ b/docs/user_guide/api_reference.rst @@ -6,6 +6,20 @@ Newspaper API .. autosummary:: :toctree: generated +Function calls +-------------- + +.. autofunction:: newspaper.article + +.. autofunction:: newspaper.build + +.. autofunction:: newspaper.mthreading.fetch_news + +.. autofunction:: newspaper.hot + +.. autofunction:: newspaper.languages + + Configuration ------------- @@ -44,7 +58,9 @@ Source .. automethod:: newspaper.Source.purge_articles() .. automethod:: newspaper.Source.feeds_to_articles() .. automethod:: newspaper.Source.categories_to_articles() +.. automethod:: newspaper.Source.generate_articles() .. automethod:: newspaper.Source.download_articles() +.. automethod:: newspaper.Source.download() .. automethod:: newspaper.Source.size() Category @@ -55,3 +71,10 @@ Category Feed ---- .. autoclass:: newspaper.source.Feed + + +Exceptions +---------- +.. autoclass:: newspaper.ArticleException + +.. 
autoclass:: newspaper.ArticleBinaryDataException
diff --git a/docs/user_guide/assets/chatgpt_chat200x75.png b/docs/user_guide/assets/chatgpt_chat200x75.png
new file mode 100644
index 0000000..3eac304
Binary files /dev/null and b/docs/user_guide/assets/chatgpt_chat200x75.png differ
diff --git a/docs/user_guide/assets/chatgpt_chat.png b/docs/user_guide/assets/chatgpt_chat75x75.png
similarity index 100%
rename from docs/user_guide/assets/chatgpt_chat.png
rename to docs/user_guide/assets/chatgpt_chat75x75.png
diff --git a/docs/user_guide/examples.rst b/docs/user_guide/examples.rst
index 90343f0..68aa4f5 100755
--- a/docs/user_guide/examples.rst
+++ b/docs/user_guide/examples.rst
@@ -3,20 +3,214 @@
 Examples and Tutorials
 ======================
 
-Building and Crawling a News Source
------------------------------------
+1. Building and Crawling News Sources Using a Multithreaded Approach
+----------------------------------------------------------------------
+Building and crawling news websites can require handling multiple sources simultaneously and processing a large volume of articles. You can significantly improve the performance of this process by using multiple threads when crawling. Even though Python is not truly multithreaded (due to the GIL), I/O requests can be handled in parallel.
 
-Getting Articles with Scrapy
----------------------------- 
 
 .. code-block:: python
 
+    from newspaper import Source
+    from newspaper.mthreading import fetch_news
+    import threading
 
-Using Playwright to Scrape Websites built with Javascript
---------------------------------------------------------- 
 
+    class NewsCrawler:
+        def __init__(self, source_urls, config=None):
+            self.sources = [Source(url, config=config) for url in source_urls]
+            self.articles = []
+
+        def build_sources(self):
+            # Multithreaded source building
+            threads = [threading.Thread(target=source.build) for source in self.sources]
+            for thread in threads:
+                thread.start()
+            for thread in threads:
+                thread.join()
 
-Using Playwright to Scrape Websites that require login
------------------------------------------------------- 
+        def crawl_articles(self):
+            # Multithreaded article downloading
+            self.articles = fetch_news(self.sources, threads=4)
+
+        def extract_information(self):
+            # Extract information from each article
+            for source in self.sources:
+                print(f"Source {source.url}")
+                for article in source.articles[:10]:
+                    article.parse()
+                    print(f"Title: {article.title}")
+                    print(f"Authors: {article.authors}")
+                    print(f"Text: {article.text[:150]}...")  # Printing first 150 characters of text
+                    print("-------------------------------")
+
+    if __name__ == "__main__":
+        source_urls = ['https://slate.com', 'https://time.com']  # Add your news source URLs here
+        crawler = NewsCrawler(source_urls)
+        crawler.build_sources()
+        crawler.crawl_articles()
+        crawler.extract_information()
+
+
+2. Getting Articles with Scrapy
+--------------------------------
+
+Install Necessary Packages
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: bash
+
+    pip install scrapy
+    pip install newspaper4k
+
+Create the Scrapy project:
+
+.. code-block:: bash
+
+    scrapy startproject news_scraper
+
+This command creates a new folder ``news_scraper`` with the necessary Scrapy files.
+
+
+Code the Scrapy Spider
+^^^^^^^^^^^^^^^^^^^^^^
+Navigate to the news_scraper/spiders folder and create a new spider. For example, news_spider.py:
+
 ..
code-block:: python + + import scrapy + import newspaper + + class NewsSpider(scrapy.Spider): + name = 'news' + start_urls = ['https://abcnews.go.com/elections'] # Replace with your target URLs + + def parse(self, response): + # Extract URLs from the response and yield Scrapy Requests + for href in response.css('a::attr(href)'): + yield response.follow(href, self.parse_article) + + def parse_article(self, response): + # Use Newspaper4k to parse the article + article = newspaper.article(response.url, language='en', input_html=response.text) + article.parse() + article.nlp() + + # Extracted information + yield { + 'url': response.url, + 'title': article.title, + 'authors': article.authors, + 'text': article.text, + 'publish_date': article.publish_date, + 'keywords': article.keywords, + 'summary': article.summary, + } + + +Run the Spider +^^^^^^^^^^^^^^ + +.. code-block:: bash + + scrapy crawl news -o output.json + + +3. Using Playwright to Scrape Websites built with Javascript +------------------------------------------------------------- + +Install Necessary Packages +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + pip install newspaper4k + pip install playwright + playwright install + +Scrape with Playwright +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. code-block:: python + + from playwright.sync_api import sync_playwright + import newspaper + import time + + def scrape_with_playwright(url): + # Using Playwright to render JavaScript + with sync_playwright() as p: + browser = p.chromium.launch() + page = browser.new_page() + page.goto(url) + time.sleep(1) # Allow the javascript to render + content = page.content() + browser.close() + + # Using Newspaper4k to parse the page content + article = newspaper.article(url, input_html=content, language='en') + + return article + + # Example URL + url = 'https://ec.europa.eu/commission/presscorner/detail/en/ac_24_84' # Replace with the URL of your choice + + # Scrape and process the article + article = scrape_with_playwright(url) + article.nlp() + + print(f"Title: {article.title}") + print(f"Authors: {article.authors}") + print(f"Publication Date: {article.publish_date}") + print(f"Summary: {article.summary}") + print(f"Keywords: {article.keywords}") + + +4. Using Playwright to Scrape Websites that require login +---------------------------------------------------------- + + +.. 
code-block:: python + + from playwright.sync_api import sync_playwright + import newspaper + + def login_and_fetch_article(url, login_url, username, password): + # Using Playwright to handle login and fetch article + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) # Set headless=False to watch the browser actions + page = browser.new_page() + + # Automating login + page.goto(login_url) + page.fill('input[name="log"]', username) # Adjust the selector as per the site's HTML + page.fill('input[name="pwd"]', password) # Adjust the selector as per the site's HTML + page.click('input[type="submit"][value="Login"]') # Adjust the selector as per the site's HTML + + # Wait for navigation after login + page.wait_for_url('/') + # Navigating to the article + page.goto(url) + content = page.content() + browser.close() + + # Using Newspaper4k to parse the page content + article = newspaper.article(url, input_html=content, language='en') + + return article + + # Example URLs and credentials + login_url = 'https://www.undercurrentnews.com/login/' # Replace with the actual login URL + article_url = 'https://www.undercurrentnews.com/2024/01/08/editors-choice-farmed-shrimp-output-to-drop-in-2024-fallout-from-us-expanded-russia-ban/' # Replace with the URL of the article you want to scrape + username = 'tester_news' # Replace with your username + password = 'test' # Replace with your password + + # Fetch and process the article + article = login_and_fetch_article(article_url, login_url, username, password) + article.nlp() + print(f"Title: {article.title}") + print(f"Authors: {article.authors}") + print(f"Publication Date: {article.publish_date}") + print(f"Summary: {article.summary}") + print(f"Keywords: {article.keywords}") diff --git a/newspaper/api.py b/newspaper/api.py index 5aaadc5..4e16eb6 100755 --- a/newspaper/api.py +++ b/newspaper/api.py @@ -3,6 +3,7 @@ # Copyright (c) Lucas Ou-Yang (codelucas) +from typing import List import feedparser from .article import Article @@ -14,8 +15,22 @@ def build(url="", dry=False, config=None, **kwargs) -> Source: - """Returns a constructed source object without + """Returns a constructed :any:`Source` object without downloading or parsing the articles + + Args: + url (str): The url of the source (news website) to build. For example, + `https://www.cnn.com`. + dry (bool): If true, the source object will be constructed but not + downloaded or parsed. + config (Configuration): A configuration object to use for the source. + kwargs: Any other keyword arguments to pass to the Source constructor. + If you omit the config object, you can add any configuration + options here. + + Returns: + Source: The constructed :any:`Source` object. 
+ """ config = config or Configuration() config.update(**kwargs) @@ -40,11 +55,11 @@ def build_article(url="", config=None, **kwargs) -> Article: def languages(): - """Returns a list of the supported languages""" + """Prints a list of the supported languages""" print_available_languages() -def popular_urls(): +def popular_urls() -> List[str]: """Returns a list of pre-extracted popular source urls""" with open(POPULAR_URLS, encoding="utf-8") as f: urls = ["http://" + u.strip() for u in f.readlines()] diff --git a/newspaper/article.py b/newspaper/article.py index 22da870..89f7e86 100755 --- a/newspaper/article.py +++ b/newspaper/article.py @@ -140,9 +140,9 @@ class Article: def __init__( self, url: str, - title: str = "", - source_url: str = "", - read_more_link: str = "", + title: Optional[str] = "", + source_url: Optional[str] = "", + read_more_link: Optional[str] = "", config: Optional[Configuration] = None, **kwargs: Dict[str, Any], ): diff --git a/newspaper/configuration.py b/newspaper/configuration.py index c882d57..2dba63e 100755 --- a/newspaper/configuration.py +++ b/newspaper/configuration.py @@ -31,7 +31,9 @@ class Configuration: """Modifies Article / Source properties. + Attributes: + min_word_count (int): minimum number of word tokens in an article text min_sent_count (int): minimum number of sentences in an article text max_title (int): :any:`Article.title` max number of chars. ``title`` @@ -60,9 +62,9 @@ class Configuration: memorize_articles (bool): If True, it will cache and save articles run between runs. The articles are *NOT* cached. It will save the parsed article urls between different - `Source`.`generate_articles()` runs. default True. - disable_category_cache (bool): If True, it will not cache the `Source` - category urls. default False. + :any:`Source.generate_articles()` runs. default True. + disable_category_cache (bool): If True, it will not cache + the :any:`Source` category urls. default False. fetch_images (bool): If False, it will not download images to verify if they obide by the settings in top_image_settings. default True. @@ -72,7 +74,7 @@ class Configuration: from the article body html. Affected property is :any:`Article.article_html`. Default True. - http_success_only (bool): if True, it will raise an ``ArticleException`` + http_success_only (bool): if True, it will raise an :any:`ArticleException` if the html status_code is >= 400 (e.g. 404 page). default True. stopwords_class (obj): unique stopword classes for oriental languages, don't toggle @@ -88,13 +90,13 @@ class Configuration: and could hang the process due to huge binary files (such as movies) default False. ignored_content_types_defaults (dict): dictionary of content-types - and a default stub content. - These content type will not be downloaded. - **Note:** - If `allow_binary_content` is False, - binary content will lead to `ArticleBinaryDataException` for - `Article.download()` and will be skipped in `Source.build()`. This - will override the defaults in :any:`ignored_content_types_defaults` + and a default stub content. These content type will not be downloaded. + + **Note:** If :any:`allow_binary_content` is False, + binary content will lead to :any:`ArticleBinaryDataException` for + :any:`Article.download()` and will be skipped in + :any:`Source.build()`. This will override the defaults + in :any:`ignored_content_types_defaults` if these match binary files. 
         use_cached_categories (bool): if set to False, the cached categories
             will be ignored and the :any:`Source` will recompute the category
@@ -206,8 +208,9 @@ def __init__(self):
     def update(self, **kwargs):
         """Update the configuration object with the given keyword arguments.
+
         Arguments:
-            **kwargs: The keyword arguments to update.
+            **kwargs: The keyword arguments to update.
         """
 
         for key, value in kwargs.items():
@@ -292,6 +295,7 @@ def language(self, value: str):
     def use_meta_language(self):
         """Read-only property that indicates whether the meta language read
         from the website was used or the language was explicitly set.
+
         Returns:
             bool: True if the meta language was used, False if the language
                 was explicitly set.
diff --git a/newspaper/exceptions.py b/newspaper/exceptions.py
index 1893fce..66e57d0 100755
--- a/newspaper/exceptions.py
+++ b/newspaper/exceptions.py
@@ -4,7 +4,7 @@
 class ArticleBinaryDataException(Exception):
     """Exception raised for binary data in urls.
 
-    will be raised if allow_binary_content is False.
+    will be raised if :any:`Configuration.allow_binary_content` is False.
     """
 
diff --git a/newspaper/mthreading.py b/newspaper/mthreading.py
index e846e8b..db7fb33 100755
--- a/newspaper/mthreading.py
+++ b/newspaper/mthreading.py
@@ -21,17 +21,19 @@ def fetch_news(
     If there is a problem in detecting the language of the urls, then
     instantiate the `Article` object yourself with the language parameter
     and pass it in.
-    Arguments:
-        news_list {List[Union[str, Article, Source]]} -- List of sources,
-            articles, urls or a mix of them.
-
-        threads {int} -- Number of threads to use for fetching. This affects
-            how many items from the news_list are fetched at once. In order to control
-            how many threads are used in a `Source` object, use the
-            `Configuration`.`number_threads` setting. This could result in
-            a high number of threads. Maximum number of threads would be
-            `threads` * `Configuration`.`number_threads`.
-
+    Args:
+        news_list (List[Union[str, Article, Source]]): List of sources,
+            articles, urls or a mix of them.
+
+        threads (int): Number of threads to use for fetching. This affects
+            how many items from the news_list are fetched at once. In order to
+            control how many threads are used in a `Source` object, use the
+            ``Configuration.number_threads`` setting. This could result in
+            a high number of threads. Maximum number of threads would be
+            ``threads`` * ``Configuration.number_threads``.
+
+    Returns:
+        List[Union[Article, Source]]: List of articles or sources.
     """
 
     def get_item(item: Union[str, Article, Source]) -> Union[Article, Source]:
diff --git a/newspaper/source.py b/newspaper/source.py
index c76386c..10080cb 100755
--- a/newspaper/source.py
+++ b/newspaper/source.py
@@ -244,7 +244,7 @@ def set_description(self):
         self.description = metadata["description"]
 
     def download(self):
-        """Downloads html of source"""
+        """Downloads html of source, i.e., the news site homepage"""
         self.html = network.get_html(self.url, self.config)
 
     def download_categories(self):
@@ -408,14 +408,23 @@ def _generate_articles(self):
         return list(uniq.values())
 
     def generate_articles(self, limit=5000):
-        """Saves all current articles of news source, filter out bad urls"""
+        """Creates the :any:`Source.articles` list of :any:`Article` objects.
+        It gets the URLs from all detected categories and RSS feeds, checks
+        them for plausibility based on their URL (using some heuristics defined
+        in the ``urls.valid_url`` function).
These can be further + downloaded using :any:`Source.download_articles()` + + Args: + limit (int, optional): The maximum number of articles to generate. + Defaults to 5000. + """ articles = self._generate_articles() self.articles = articles[:limit] log.debug("%d articles generated and cutoff at %d", len(articles), limit) def download_articles(self) -> List[Article]: """Starts the ``download()`` for all :any:`Article` objects - from the ``articles`` property. It can run single threaded or + in the :any:`Source.articles` property. It can run single threaded or multi-threaded. Returns: List[:any:`Article`]: A list of downloaded articles. diff --git a/tests/test_source.py b/tests/test_source.py index 3d2a75a..c7affff 100755 --- a/tests/test_source.py +++ b/tests/test_source.py @@ -110,6 +110,8 @@ def test_empty_url_source(self): with pytest.raises(ValueError): Source(url=None) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_build_source(self, cnn_source): source = Source(cnn_source["url"], verbose=False, memorize_articles=False) source.clean_memo_cache() @@ -130,6 +132,8 @@ def test_build_source(self, cnn_source): # assert sorted(source.category_urls()) == sorted(cnn_source["category_urls"]) # assert sorted(source.feed_urls()) == sorted(cnn_source["feeds"]) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_memorize_articles(self, cnn_source): source = Source(cnn_source["url"], verbose=False, memorize_articles=True) source.clean_memo_cache() @@ -184,6 +188,8 @@ def stub_func(_, domain): with pytest.raises(Exception): stub_func(None, source.domain) + # Skip if GITHUB_ACTIONS. It can fail because of internet access + @pytest.mark.skipif("GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS") def test_get_feeds(self, feed_sources): for feed_source in feed_sources: source = Source(feed_source["url"])
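
As a side note on the test changes above: the `GITHUB_ACTIONS` skip condition is repeated before each network-dependent test. It could also be defined once and reused; the following is a minimal sketch with an illustrative marker name and a placeholder test, assuming the same environment-variable convention:

```python
import os

import pytest

# Reusable skip marker mirroring the condition added in tests/test_source.py:
# network-dependent tests are skipped when running under GitHub Actions.
requires_network = pytest.mark.skipif(
    "GITHUB_ACTIONS" in os.environ, reason="Skip if GITHUB_ACTIONS"
)

@requires_network
def test_build_source_smoke():
    # Placeholder body; a real test would build a Source and assert on it.
    assert True
```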